diff --git a/BAMixChecker.py b/BAMixChecker.py index 33a5450..6788ece 100644 --- a/BAMixChecker.py +++ b/BAMixChecker.py @@ -1,9 +1,9 @@ -#!/usr/bin/env python2.7 +#!/usr/bin/env python3 """ - BAMixChecker version 1.0.1 + BAMixChecker version 2.0 Author : Hein Chun (heinc1010@gmail.com) """ -__version__ = '1.0.1' +__version__ = '2.0' try: import sys @@ -11,18 +11,17 @@ import re import time import copy - import commands + import subprocess import argparse import numpy as np - from math import log, e - from os.path import isfile, join, abspath - from subprocess import Popen,PIPE + from os.path import isfile, abspath + from subprocess import Popen, PIPE from scipy.stats import entropy - from multiprocessing import Process, Array, Pool + from multiprocessing import Process, Array, Pool, active_children except ImportError: - sys.exit("## ERROR: Required python package or packages are not installed.\n \ - Check and install the package or packages with 'pip install'.\ - - Required python packages : numpy, scipy.stats, multiprocessing") + sys.exit("## ERROR: Required python package or packages are not installed.\n" + " Check and install the package or packages with 'pip install'." + " - Required python packages : numpy, scipy.stats, multiprocessing") class VCFinfo: def __init__(self,file_path): @@ -66,9 +65,9 @@ def get_file_list(dir_path,user_file_list,FullPATH,flag_FNM): for file_path in lis_files_whole: tmp_file_type = file_path.split(".")[-1].upper() if tmp_file_type not in ['BAM','CRAM','SAM']: - print "## ERROR: Input files in the list are needed to be .bam or .cram file" - print "## "+file_path+ " is not .bam or .cram." - exit() + print("## ERROR: Input files in the list are needed to be .bam or .cram file") + print("## "+file_path+ " is not .bam or .cram.") + sys.exit(1) else: lis_files_in_dir = [ dir_path+f for f in os.listdir(dir_path) if isfile("/".join([dir_path, f])) ] for file_path in lis_files_in_dir: @@ -98,38 +97,41 @@ def get_file_list(dir_path,user_file_list,FullPATH,flag_FNM): def make_bed( total_SNP_bed, targeted_bed , OUTDIR, bedtools_path ): os.system("{0} intersect -b {1} -a {2} | uniq > {3}targetSNPs.bed".format(bedtools_path,targeted_bed,total_SNP_bed,OUTDIR)) - (exitstatus, line_count) = commands.getstatusoutput("cat {0}targetSNPs.bed | wc -l".format(OUTDIR)) + (exitstatus, line_count) = subprocess.getstatusoutput("cat {0}targetSNPs.bed | wc -l".format(OUTDIR)) if int(line_count) < 200: if total_SNP_bed.split("/")[-1] not in ["gnomad_hg38_AF10.bed","gnomad_hg19_AF10.bed"]: os.system("rm {0}targetSNPs.bed".format(OUTDIR)) return None else: - print "## WARNING: The target size is too small, so the number of SNP sites to compare is under 200." - print "## The specificity could be lower with the small numbr of SNP loci." - print "Make an optimized list of SNPs to compare - 'targetSNPs.bed'\n", + print("## WARNING: The target size is too small, so the number of SNP sites to compare is under 200.") + print("## The specificity could be lower with the small numbr of SNP loci.") + print("Make an optimized list of SNPs to compare - 'targetSNPs.bed'") return OUTDIR + "targetSNPs.bed" else: - print "Make an optimized list of SNPs to compare - 'targetSNPs.bed'\n", + print("Make an optimized list of SNPs to compare - 'targetSNPs.bed'") return OUTDIR + "targetSNPs.bed" def run_p(cmm): - print cmm + print(cmm) try: prc= Popen(cmm, stdout=PIPE, shell=True, stderr=PIPE) stdoutput, stderr = prc.communicate() + # decode stderr/stdoutput if bytes + if isinstance(stderr, bytes): + stderr = stderr.decode('utf-8', errors='replace') + if isinstance(stdoutput, bytes): + stdoutput = stdoutput.decode('utf-8', errors='replace') if prc.returncode != 0: - print stderr - raise - exit() + print(stderr) + raise RuntimeError(stderr) except KeyboardInterrupt: - for p in multiprocessing.active_children(): + for p in active_children(): p.terminate() - exit() + sys.exit(1) def run_HC( lis_bam_files, OutputDIR, reference_file, bed_file, max_prc, HC_path, ploidy): - print "Calling the variants information with GATK HaplotypeCaller" - print "Max process : ", - print max_prc + print("Calling the variants information with GATK HaplotypeCaller") + print("Max process :", max_prc) max_prc = int(max_prc) vcf_file_path = OutputDIR + "HaplotypeCaller/" cmm = "mkdir -p {0}".format(vcf_file_path) @@ -147,19 +149,19 @@ def run_HC( lis_bam_files, OutputDIR, reference_file, bed_file, max_prc, HC_path pool.close() pool.join() except KeyboardInterrupt: - print "## ERROR: KeyboardInterrupt" + print("## ERROR: KeyboardInterrupt") pool.close() pool.terminate() pool.join() - exit() + sys.exit(1) except: pool.close() pool.terminate() pool.join() - print "## ERROR: Calling error with GATK HaplotyCaller." - print " Preperation for calling with GATK is instructed in gitHub 'https://github.com/heinc1010/BAMixChecker'." - print " Refer the instruction or given error message above from GATK." - exit() + print("## ERROR: Calling error with GATK HaplotyCaller.") + print(" Preperation for calling with GATK is instructed in gitHub 'https://github.com/heinc1010/BAMixChecker'.") + print(" Refer the instruction or given error message above from GATK.") + sys.exit(1) for f in lis_bam_files: tmp_f = f.split("/")[-1] @@ -352,20 +354,20 @@ def get_sw_pairs(lis_files,smp_pairs): if f1 in smp_pairs[f]: if f not in lis_m_f: dic_sw[f1] = [f] - lis_sw_keys = dic_sw.keys() + lis_sw_keys = list(dic_sw.keys()) lis_sw_keys.sort() for f1 in lis_sw_keys: dic_sw[f1].sort() for f2 in dic_sw[f1]: try: - if (f1 != f2) & (f1 in dic_sw[f2]): + if (f1 != f2) and (f1 in dic_sw[f2]): dic_sw[f2].remove(f1) - except: + except Exception: continue return dic_sw, dic_un_p def get_sw_pairs_ans(lis_files, smp_pairs, lis_ans): - lis_f1 = smp_pairs.keys() + lis_f1 = list(smp_pairs.keys()) lis_f1.sort() dic_sw = {} dic_un_p = {} @@ -388,7 +390,7 @@ def get_sw_pairs_ans(lis_files, smp_pairs, lis_ans): if f2 not in g_f: dic_sw[f1].append(f2) - lis_sw_keys = dic_sw.keys() + lis_sw_keys = list(dic_sw.keys()) lis_sw_keys.sort() for f1 in lis_sw_keys: dic_sw[f1].sort() @@ -399,7 +401,7 @@ def get_sw_pairs_ans(lis_files, smp_pairs, lis_ans): return dic_sw, dic_un_p def make_result_file_no_file_name_info(cor_matrix,smp_pairs,lis_files,OutputDIR,lis_paired_files): - print " Skip making 'Mismatched_samples.txt' file" + print(" Skip making 'Mismatched_samples.txt' file") fw_m_m = open(OutputDIR+"Matched_samples.txt","w") fw_m_m.write("# Matched pair by genotype\n") lis_done = [] @@ -425,15 +427,15 @@ def make_result_file_no_file_name_info(cor_matrix,smp_pairs,lis_files,OutputDIR, def make_result_file(cor_matrix,smp_pairs,lis_files,OutputDIR,lis_ans,flag_FNM): return_v = 1 fw_a_m = open(OutputDIR+"Total_result.txt","w") - lis_paired_files = smp_pairs.keys() + lis_paired_files = list(smp_pairs.keys()) lis_paired_files.sort() len_v = len(lis_files) lis_sw = [] lis_up = [] lis_m = [] - if ( lis_ans == [] ) & (len(lis_files) < 6) : - print "## WARNING : The number of files is not enough to pair by file names." - print " Pairing samples only by genotype." + if ( lis_ans == [] ) and (len(lis_files) < 6) : + print("## WARNING : The number of files is not enough to pair by file names.") + print(" Pairing samples only by genotype.") make_result_file_no_file_name_info(cor_matrix,smp_pairs,lis_files,OutputDIR,lis_paired_files) for i in range(0,len_v-1): for j in range(i+1, len_v): @@ -454,8 +456,8 @@ def make_result_file(cor_matrix,smp_pairs,lis_files,OutputDIR,lis_ans,flag_FNM): dic_un_p = {} if lis_ans == []: if flag_FNM == False: - print "## WARNING : --OFFFileNamePairing option is applied." - print " Pairing samples only by genotype." + print("## WARNING : --OFFFileNamePairing option is applied.") + print(" Pairing samples only by genotype.") make_result_file_no_file_name_info(cor_matrix,smp_pairs,lis_files,OutputDIR,lis_paired_files) return_v = 0 for i in range(0,len_v-1): @@ -470,9 +472,9 @@ def make_result_file(cor_matrix,smp_pairs,lis_files,OutputDIR,lis_ans,flag_FNM): mk_heat_map(OutputDIR,lis_files,cor_matrix) return return_v dic_sw, dic_un_p = get_sw_pairs(lis_files,smp_pairs) - if ( dic_sw == None ) & ( dic_un_p == None): - print "## WARNING : The file names don't have detectable common regulation." - print " Pairing samples only by genotype." + if ( dic_sw == None ) and ( dic_un_p == None): + print("## WARNING : The file names don't have detectable common regulation.") + print(" Pairing samples only by genotype.") make_result_file_no_file_name_info(cor_matrix,smp_pairs,lis_files,OutputDIR,lis_paired_files) return_v = 0 for i in range(0,len_v-1): @@ -487,7 +489,7 @@ def make_result_file(cor_matrix,smp_pairs,lis_files,OutputDIR,lis_ans,flag_FNM): mk_heat_map(OutputDIR,lis_files,cor_matrix) return return_v else: - print "Detected pairs by file names." + print("Detected pairs by file names.") else: dic_sw, dic_un_p = get_sw_pairs_ans(lis_files, smp_pairs, lis_ans) fw_m_m = open(OutputDIR+"Matched_samples.txt","w") @@ -515,7 +517,7 @@ def make_result_file(cor_matrix,smp_pairs,lis_files,OutputDIR,lis_ans,flag_FNM): if len(smp_pairs[f1]) > 1: pass for f2 in smp_pairs[f1]: - if (f2 not in dic_sw[f1]) & (f1 not in dic_sw[f2]): + if (f2 not in dic_sw[f1]) and (f1 not in dic_sw[f2]): lis_tmp=[f1,f2] lis_tmp.sort() if lis_tmp not in lis_done: @@ -533,7 +535,7 @@ def make_result_file(cor_matrix,smp_pairs,lis_files,OutputDIR,lis_ans,flag_FNM): fw_m_m.close() if not perfect_m: fw_s_m.write("# Matched samples only by genotype or file name but not by both\n") - lis_sw_keys = dic_sw.keys() + lis_sw_keys = list(dic_sw.keys()) lis_sw_keys.sort() for f1 in lis_sw_keys: for f2 in dic_sw[f1]: @@ -727,13 +729,17 @@ def mk_html_dic(OutputDIR,lis_m,lis_sw,lis_up): cmm = "Rscript {0} {1} {2}".format(os.path.dirname(os.path.realpath(__file__))+"/r_script.r",OutputDIR + "BAMixChecker_Report.Rmd", OutputDIR) prc= Popen(cmm, stdout=PIPE, shell=True, stderr=PIPE) stdoutput, stderr = prc.communicate() + if isinstance(stderr, bytes): + stderr = stderr.decode('utf-8', errors='replace') if prc.returncode != 0: - print stderr + print(stderr) cmm = "rm {0}".format(OutputDIR + "BAMixChecker_Report.Rmd") prc= Popen(cmm, stdout=PIPE, shell=True, stderr=PIPE) stdoutput, stderr = prc.communicate() + if isinstance(stderr, bytes): + stderr = stderr.decode('utf-8', errors='replace') if prc.returncode != 0: - print stderr + print(stderr) def mk_html_no_mismatched(OutputDIR, lis_m): fw_r = open(OutputDIR + "BAMixChecker_Report.Rmd","w") @@ -776,12 +782,16 @@ def mk_html_no_mismatched(OutputDIR, lis_m): prc= Popen(cmm, stdout=PIPE, shell=True, stderr=PIPE) stdoutput, stderr = prc.communicate() if prc.returncode != 0: - print stderr + if isinstance(stderr, bytes): + stderr = stderr.decode('utf-8', errors='replace') + print(stderr) cmm = "rm {0}".format(OutputDIR + "BAMixChecker_Report.Rmd") prc= Popen(cmm, stdout=PIPE, shell=True, stderr=PIPE) stdoutput, stderr = prc.communicate() if prc.returncode != 0: - print stderr + if isinstance(stderr, bytes): + stderr = stderr.decode('utf-8', errors='replace') + print(stderr) def mk_heat_map(OutputDIR,lis_files,cor_matrix): lis_files = [ "'"+f+"'" for f in lis_files] @@ -816,12 +826,16 @@ def mk_heat_map(OutputDIR,lis_files,cor_matrix): prc= Popen(cmm, stdout=PIPE, shell=True, stderr=PIPE) stdoutput, stderr = prc.communicate() if prc.returncode != 0: - print stderr + if isinstance(stderr, bytes): + stderr = stderr.decode('utf-8', errors='replace') + print(stderr) cmm = "rm {0}".format(OutputDIR + "BAMixChecker_Heatmap.R") prc= Popen(cmm, stdout=PIPE, shell=True, stderr=PIPE) stdoutput, stderr = prc.communicate() if prc.returncode != 0: - print stderr + if isinstance(stderr, bytes): + stderr = stderr.decode('utf-8', errors='replace') + print(stderr) if __name__ == "__main__": start_t = time.time() @@ -850,11 +864,11 @@ def mk_heat_map(OutputDIR,lis_files,cor_matrix): HC_path = line.split('=')[1].strip() fr_config.close() if bedtools_path == '': - print "## ERROR: bedtools path is not set in 'BAMixChecker.config' file." - exit() + print("## ERROR: bedtools path is not set in 'BAMixChecker.config' file.") + sys.exit(1) elif HC_path == "": - print "## ERROR: GATK path is not set in 'BAMixChecker.config' file." - exit() + print("## ERROR: GATK path is not set in 'BAMixChecker.config' file.") + sys.exit(1) # get the arguments @@ -864,13 +878,13 @@ def mk_heat_map(OutputDIR,lis_files,cor_matrix): args = parser.parse_args() if args.DIR == '': if args.List == '': - print "## ERROR: There is no information about input files. Use -d or -l option for the input file information." - exit() + print("## ERROR: There is no information about input files. Use -d or -l option for the input file information.") + sys.exit(1) else: if args.List != '': - print dir_path - print "## ERROR: Option -d and -l are exclusive. Try with one of the options.\n##\t Check 'https://github.com/heinc1010/BAMixChecker' for more information about the options of BAMixChecker." - exit() + print(dir_path) + print("## ERROR: Option -d and -l are exclusive. Try with one of the options.\n##\t Check 'https://github.com/heinc1010/BAMixChecker' for more information about the options of BAMixChecker.") + sys.exit(1) dir_path = abspath(args.DIR)+'/' if args.OFFFileNameMatching: @@ -883,7 +897,7 @@ def mk_heat_map(OutputDIR,lis_files,cor_matrix): cmm = "mkdir -p {0}".format(out_path) os.system(cmm) - print "Checking required R packages." + print("Checking required R packages.") install_check="'if(!(require(rmarkdown))){install.packages('rmarkdown')}\nif(!(require(ztable))){install.packages('ztable')}\nif(!(require(corrplot))){install.packages('corrplot')}'" cmm = "echo {0} > {1}".format(install_check,out_path + "R_packages_install_check.R") prc= Popen(cmm, stdout=PIPE, shell=True, stderr=PIPE) @@ -892,19 +906,19 @@ def mk_heat_map(OutputDIR,lis_files,cor_matrix): prc= Popen(cmm, stdout=PIPE, shell=True, stderr=PIPE) stdoutput, stderr = prc.communicate() if prc.returncode != 0: - print "WARNING: Requied R packages are not installed." - print " HTML file or Heatmap would not be created properly." - print " Recommand to install related R packages." - print " - Required R packages : rmarkdown, ztable, corrplot." + print("WARNING: Requied R packages are not installed.") + print(" HTML file or Heatmap would not be created properly.") + print(" Recommand to install related R packages.") + print(" - Required R packages : rmarkdown, ztable, corrplot.") # print stderr cmm = "rm {0}".format(out_path + "R_packages_install_check.R") prc= Popen(cmm, stdout=PIPE, shell=True, stderr=PIPE) if args.Ref == "": - print "## ERROR: Reference file is necessary. Use -r option." - print exit() + print("## ERROR: Reference file is necessary. Use -r option.") + sys.exit(1) flag_chr = False - (exitstatus, header) = commands.getstatusoutput("head -1 {0}".format(args.Ref)) + (exitstatus, header) = subprocess.getstatusoutput("head -1 {0}".format(args.Ref)) if header.startswith(">chr"): flag_chr = True if flag_chr: @@ -915,14 +929,14 @@ def mk_heat_map(OutputDIR,lis_files,cor_matrix): bed_file = None if args.RefVer in [ "hg38","hg19" ]: if args.BEDfile != '': - print "Run for targeted sequecing data" + print("Run for targeted sequecing data") if args.NonHumanSNPlist == "": bed_file = make_bed("{0}gnomad_{1}_AF{2}_AF{3}_All.bed".format(bed_file_path,args.RefVer,45,35), args.BEDfile , out_path, bedtools_path) for AF in range(45,5,-5): for AF_all in range(AF,-1,-10): if AF_all >= AF: continue - AF_all = int(AF_all/10)*10 + AF_all = (AF_all//10)*10 if AF_all != 0: bed_file = make_bed("{0}gnomad_{1}_AF{2}_AF{3}_All.bed".format(bed_file_path,args.RefVer,AF,AF_all), args.BEDfile , out_path, bedtools_path) else: @@ -934,20 +948,20 @@ def mk_heat_map(OutputDIR,lis_files,cor_matrix): else: bed_file = "{0}gnomad_{1}_AF45_AF35_All.bed".format(bed_file_path,args.RefVer) else: - print "## ERROR: Option -v should be 'hg19' or 'hg38'." - exit() + print("## ERROR: Option -v should be 'hg19' or 'hg38'.") + sys.exit(1) if args.NonHumanSNPlist != "": - print "Non human SNP list is given." - (exitstatus, line_count) = commands.getstatusoutput("ls {0}".format(args.NonHumanSNPlist)) + print("Non human SNP list is given.") + (exitstatus, line_count) = subprocess.getstatusoutput("ls {0}".format(args.NonHumanSNPlist)) if exitstatus != 0: - print "# ERROR: Fail to read " + args.NonHumanSNPlist - exit() + print("# ERROR: Fail to read " + args.NonHumanSNPlist) + sys.exit(1) if args.BEDfile != '': - (exitstatus, line_count) = commands.getstatusoutput("{0} intersect -b {1} -a {2} | wc -l".format(bedtools_path, args.BEDfile, args.NonHumanSNPlist)) + (exitstatus, line_count) = subprocess.getstatusoutput("{0} intersect -b {1} -a {2} | wc -l".format(bedtools_path, args.BEDfile, args.NonHumanSNPlist)) else: - (exitstatus, line_count) = commands.getstatusoutput("wc -l {0}".format(args.NonHumanSNPlist)) - print line_count +" SNP loci will be compared." + (exitstatus, line_count) = subprocess.getstatusoutput("wc -l {0}".format(args.NonHumanSNPlist)) + print(str(line_count) +" SNP loci will be compared.") bed_file = args.NonHumanSNPlist if args.FullPATH: @@ -956,8 +970,8 @@ def mk_heat_map(OutputDIR,lis_files,cor_matrix): lis_bam_files,lis_ans, lis_bam_full_path = get_file_list(dir_path, args.List,False, flag_FNM) if len(lis_bam_files) == 0: - print "## ERROR: No .bam file in the list or directory." - exit() + print("## ERROR: No .bam file in the list or directory.") + sys.exit(1) # call the variants @@ -974,13 +988,13 @@ def mk_heat_map(OutputDIR,lis_files,cor_matrix): if result == 1: - print "Perfect match." + print("Perfect match.") elif result == 2: - print "Swapped file exist. Check 'BAMixChecker_Report.html' or 'Mismatched_samples.txt' file." + print("Swapped file exist. Check 'BAMixChecker_Report.html' or 'Mismatched_samples.txt' file.") if args.RemoveVCF: cmm = "rm -r {0}HaplotypeCaller/".format(out_path) os.system(cmm) end_t = time.time() - start_t - print "Running time: "+str(round(end_t/60,2))+" min" + print("Running time: "+str(round(end_t/60,2))+" min") diff --git a/README.md b/README.md index 0b3a031..2f9b97b 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Required tools #### Bedtools -#### Python 2.7 +#### Python 3 - scipy.stats - numpy