diff --git a/main/cds.fasta b/main/cds.fasta index d0ebd79..e277831 100644 --- a/main/cds.fasta +++ b/main/cds.fasta @@ -2,3 +2,5 @@ ATGGCGAAAGAAGATAACATTGAAATGCAGGGCACCGTGCTGGAAACCCTGCCGAACACCATGTTTCGCGTGGAACTGGAAAACGGCCATGTGGTGACCGCGCATATTAGCGGCAAAATGCGCAAAAACTATATTCGCATTCTGACCGGCGATAAAGTGACCGTGGAACTGACCCCGTATGATCTGAGCAAAGGCCGCATTGTGTTTCGCAGCCGCTAA >aroB ATGGAGAGGATAGTGGTCACCCTGGGTGAGCGGTCTTATCCAATCACGATCGCTTCGGGGTTATTTAATGAGCCTGCTTCGTTCCTGCCGCTGAAATCGGGGGAGCAGGTAATGCTCGTGACAAATGAGACACTGGCCCCCCTGTATTTGGATAAAGTTAGGGGTGTCTTGGAGCAGGCAGGTGTGAACGTCGATAGTGTCATCCTGCCTGATGGCGAGCAGTATAAATCTTTAGCGGTGCTGGACACCGTGTTCACCGCACTGCTACAAAAACCGCATGGGCGCGACACTACTTTAGTCGCACTGGGGGGCGGTGTCGTTGGTGATCTGACAGGCTTCGCGGCCGCGAGTTATCAACGGGGCGTGCGGTTTATTCAGGTTCCGACCACTCTACTGAGTCAAGTTGATTCTAGTGTAGGCGGCAAAACGGCAGTCAACCATCCATTGGGCAAAAATATGATTGGTGCGTTCTACCAACCAGCTTCGGTCGTCGTGGATCTGGATTGCCTGAAGACGCTGCCGCCCCGGGAACTAGCAAGTGGTCTGGCTGAGGTTATCAAATATGGCATCATTCTGGACGGCGCATTCTTTAACTGGCTTGAAGAGAATCTGGATGCCCTACTAAGACTGGACGGTCCAGCTATGGCCTATTGTATTAGACGGTGCTGTGAGCTAAAAGCCGAGGTTGTTGCTGCGGACGAACGAGAGACTGGTCTACGTGCTTTGTTGAATCTAGGTCACACGTTTGGTCACGCCATCGAAGCCGAAATGGGTTATGGAAATTGGCTTCACGGGGAGGCGGTGGCAGCGGGGATGGTCATGGCCGCACGGACATCTGAGCGGCTGGGACAATTCTCGAGTGCCGAGACTCAAAGAATCATCACACTGTTAAAACGGGCAGGCCTGCCAGTTAATGGTCCAAGGGAAATGTCGGCCCAAGCCTACTTGCCCCACATGCTGCGGGACAAGAAAGTTCTAGCAGGGGAAATGAGGTTGATCCTTCCTCTGGCCATTGGGAAAAGTGAAGTGAGATCCGGTGTCTCGCATGAACTAGTTCTTAACGCGATAGCTGACTGTCAATCAGCGTAA +>cysJ +ATGACCACCCAGGTGCCGCCGAGCGCGCTGCTGCCGCTGAACCCGGAACAGCTGGCGCGCCTGCAGGCGGCGACCACCGATCTGACCCCGACCCAGCTGGCGTGGGTGAGCGGCTATTTTTGGGGCGTGCTGAACCAGCAGCCGGCGGCGCTGGCGGCGACCCCGGCGCCGGCGGCGGAAATGCCGGGCATTACCATTATTAGCGCGAGCCAGACCGGCAACGCGCGCCGCGTGGCGGAAGCGCTGCGCGATGATCTGCTGGCGGCGAAACTGAACGTGAAACTGGTGAACGCGGGCGATTATAAATTTAAACAGATTGCGAGCGAAAAACTGCTGATTGTGGTGACCAGCACCCAGGGCGAAGGCGAACCGCCGGAAGAAGCGGTGGCGCTGCATAAATTTCTGTTTAGCAAAAAAGCGCCGAAACTGGAAAACACCGCGTTTGCGGTGTTTAGCCTGGGCGATAGCAGCTATGAATTTTTTTGCCAGAGCGGCAAAGATTTTGATAGCAAACTGGCGGAACTGGGCGGCGAACGCCTGCTGGATCGCGTGGATGCGGATGTGGAATATCAGGCGGCGGCGAGCGAATGGCGCGCGCGCGTGGTGGATGCGCTGAAAAGCCGCGCGCCGGTGGCGGCGCCGAGCCAGAGCGTGGCGACCGGCGCGGTGAACGAAATTCATACCAGCCCGTATAGCAAAGATGCGCCGCTGGTGGCGAGCCTGAGCGTGAACCAGAAAATTACCGGCCGCAACAGCGAAAAAGATGTGCGCCATATTGAAATTGATCTGGGCGATAGCGGCATGCGCTATCAGCCGGGCGATGCGCTGGGCGTGTGGTATCAGAACGATCCGGCGCTGGTGAAAGAACTGGTGGAACTGCTGTGGCTGAAAGGCGATGAACCGGTGACCGTGGAAGGCAAAACCCTGCCGCTGAACGAAGCGCTGCAGTGGCATTTTGAACTGACCGTGAACACCGCGAACATTGTGGAAAACTATGCGACCCTGACCCGCAGCGAAACCCTGCTGCCGCTGGTGGGCGATAAAGCGAAACTGCAGCATTATGCGGCGACCACCCCGATTGTGGATATGGTGCGCTTTAGCCCGGCGCAGCTGGATGCGGAAGCGCTGATTAACCTGCTGCGCCCGCTGACCCCGCGCCTGTATAGCATTGCGAGCAGCCAGGCGGAAGTGGAAAACGAAGTGCATGTGACCGTGGGCGTGGTGCGCTATGATGTGGAAGGCCGCGCGCGCGCGGGCGGCGCGAGCAGCTTTCTGGCGGATCGCGTGGAAGAAGAAGGCGAAGTGCGCGTGTTTATTGAACATAACGATAACTTTCGCCTGCCGGCGAACCCGGAAACCCCGGTGATTATGATTGGCCCGGGCACCGGCATTGCGCCGTTTCGCGCGTTTATGCAGCAGCGCGCGGCGGATGAAGCGCCGGGCAAAAACTGGCTGTTTTTTGGCAACCCGCATTTTACCGAAGATTTTCTGTATCAGGTGGAATGGCAGCGCTATGTGAAAGATGGCGTGCTGACCCGCATTGATCTGGCGTGGAGCCGCGATCAGAAAGAAAAAGTGTATGTGCAGGATAAACTGCGCGAACAGGGCGCGGAACTGTGGCGCTGGATTAACGATGGCGCGCATATTTATGTGTGCGGCGATGCGAACCGCATGGCGAAAGATGTGGAACAGGCGCTGCTGGAAGTGATTGCGGAATTTGGCGGCATGGATACCGAAGCGGCGGATGAATTTCTGAGCGAACTGCGCGTGGAACGCCGCTATCAGCGCGATGTGTAT diff --git a/main/main.jl b/main/main.jl index a80dcc4..39d5ca3 100644 --- a/main/main.jl +++ b/main/main.jl @@ -389,7 +389,7 @@ function run_file() the_out_path = "$out_dir/$(short)_$(long)_$frame/" if !(isdir(the_out_path)) - run(`mkdir $the_out_path`) + run(`mkdir -p $the_out_path`) end log_io = open("$out_dir/$(short)_$(long)_$frame/log_$(rand_barcode).txt", "w+") @@ -418,6 +418,6 @@ function run_file() end -run_file() +@time run_file() end diff --git a/main/outparser.jl b/main/outparser.jl new file mode 100644 index 0000000..47a495f --- /dev/null +++ b/main/outparser.jl @@ -0,0 +1,135 @@ +""" +outparser: CAMEOS output parser +""" + +using JLD # Should be placed here too to avoid later crash loading JLD file + +module main + +include("optimize.jl") +include("types.jl") + +using ArgParse, JLD + +function parse_commandline() + s = ArgParseSettings() + @add_arg_table s begin + "mark_gene" + help = "1st positional argument, name of gene 'mark'" + arg_type = String + default = "cysJ" + "deg_gene" + help = "2nd positional argument, name of gene 'deg'" + arg_type = String + default = "infA" + "runid" + help = "3rd positional argument, run id code created by CAMEOS" + arg_type = String + default = "LgolUs7k" + "--frame" + help = "optional argument, CAMEOS frame" + arg_type = String + default = "p1" + "--fasta" + help = "generate a FASTA file too" + action = :store_true + "--just-fullseq" + help = "the FASTA will only contain the full sequence" + action = :store_true + end + return parse_args(s) +end + +function outparse_cameos() + println("=-= CAMEOS output parser =-= v0.4 - Mar 2021 =-= by LLNL =-=") + + # Parse arguments + parsed_args = parse_commandline() + mark_gene = parsed_args["mark_gene"] + deg_gene = parsed_args["deg_gene"] + runid = parsed_args["runid"] + frame = parsed_args["frame"] + fasta = parsed_args["fasta"] + justfullseq = parsed_args["just-fullseq"] + + # Get complete path for input and output files + subdir = string(mark_gene, "_", deg_gene, "_", frame) + jld_file = string("saved_pop_", runid, ".jld") + csv_file = string("summary_", runid, ".csv") + fa_file = string("variants_", runid, ".fasta") + in_path = string("output/", subdir, "/", jld_file) + out_path = string("output/", subdir, "/", csv_file) + fa_path = string("output/", subdir, "/", fa_file) + + # Load JLD file + print("Loading data from JLD file ", in_path, " ... ") + variants = load(in_path)["variants"]; + println("OK!") + + # Save CSV file + print("Saving data to CSV file ", out_path, " : ") + parsed = 0 + open(out_path, "w") do io + println(io, string("full_seq,", + mark_gene,"_psls,", + mark_gene,"_seq,", + deg_gene,"_psls,", + deg_gene,"_seq")) + for var in variants + println(io, var.full_sequence,",", + var.mark_prob,",", + var.mark_seq,",", #var.mark_nuc,",", + var.deg_prob,",", + var.deg_seq) #,",",var.deg_nuc) + parsed += 1 + if parsed % 200 == 0 + print(".") + end + end + end + println(" OK!") + println(parsed, " variants parsed for ", runid) + + # Save fasta file + if fasta + print("Saving data to FASTA file ", fa_path, " : ") + variant = 1 + open(fa_path, "w") do io + + function print2fasta(var, field::String, nucs::Bool=false) + # Prints header and sequence to fasta file + println(io, ">CAMEOS run:", runid, + " mark_gene:", mark_gene, + " deg_gene:", deg_gene, + " variant:", variant, "/", parsed, + " ", string(field)) + if nucs + println(io, getfield(getfield(var, Symbol(field)), :nucs)) + else + println(io, getfield(var, Symbol(field))) + end + return nothing + end + + for var in variants + if justfullseq + print2fasta(var, "full_sequence") # equal to mark_nuc.nucs + else + print2fasta(var, "mark_seq") + print2fasta(var, "mark_nuc", true) + print2fasta(var, "deg_seq") + print2fasta(var, "deg_nuc", true) + end + variant += 1 + if variant % 200 == 0 + print(".") + end + end + end + println(" OK!") + println(variant-1, " variants in FASTA file for ", runid) + end +end + +@time outparse_cameos() +end diff --git a/main/proteins.fasta b/main/proteins.fasta index 7c52fdc..d983123 100644 --- a/main/proteins.fasta +++ b/main/proteins.fasta @@ -2,3 +2,5 @@ MAKEDNIEMQGTVLETLPNTMFRVELENGHVVTAHISGKMRKNYIRILTGDKVTVELTPYDLSKGRIVFRSR >aroB MERIVVTLGERSYPITIASGLFNEPASFLPLKSGEQVMLVTNETLAPLYLDKVRGVLEQAGVNVDSVILPDGEQYKSLAVLDTVFTALLQKPHGRDTTLVALGGGVVGDLTGFAAASYQRGVRFIQVPTTLLSQVDSSVGGKTAVNHPLGKNMIGAFYQPASVVVDLDCLKTLPPRELASGLAEVIKYGIILDGAFFNWLEENLDALLRLDGPAMAYCIRRCCELKAEVVAADERETGLRALLNLGHTFGHAIEAEMGYGNWLHGEAVAAGMVMAARTSERLGQFSSAETQRIITLLKRAGLPVNGPREMSAQAYLPHMLRDKKVLAGEMRLILPLAIGKSEVRSGVSHELVLNAIADCQSA +>cysJ +MTTQVPPSALLPLNPEQLARLQAATTDLTPTQLAWVSGYFWGVLNQQPAALAATPAPAAEMPGITIISASQTGNARRVAEALRDDLLAAKLNVKLVNAGDYKFKQIASEKLLIVVTSTQGEGEPPEEAVALHKFLFSKKAPKLENTAFAVFSLGDSSYEFFCQSGKDFDSKLAELGGERLLDRVDADVEYQAAASEWRARVVDALKSRAPVAAPSQSVATGAVNEIHTSPYSKDAPLVASLSVNQKITGRNSEKDVRHIEIDLGDSGMRYQPGDALGVWYQNDPALVKELVELLWLKGDEPVTVEGKTLPLNEALQWHFELTVNTANIVENYATLTRSETLLPLVGDKAKLQHYAATTPIVDMVRFSPAQLDAEALINLLRPLTPRLYSIASSQAEVENEVHVTVGVVRYDVEGRARAGGASSFLADRVEEEGEVRVFIEHNDNFRLPANPETPVIMIGPGTGIAPFRAFMQQRAADEAPGKNWLFFGNPHFTEDFLYQVEWQRYVKDGVLTRIDLAWSRDQKEKVYVQDKLREQGAELWRWINDGAHIYVCGDANRMAKDVEQALLEVIAEFGGMDTEAADEFLSELRVERRYQRDVY diff --git a/prepare/energies_and_psls.jl b/prepare/energies_and_psls.jl index 5fd9fa4..fe7e377 100644 --- a/prepare/energies_and_psls.jl +++ b/prepare/energies_and_psls.jl @@ -38,6 +38,44 @@ function compute_psls(prot_mat, w1, w2, nNodes, nProt) return -all_scores end +function save_energies(computed_energies, gene_name) + local out_file + try + out_file = open("../main/energies/energy_$(gene_name).txt", "w") + catch err + if isa(err, SystemError) + out_file = open("energy_$(gene_name).txt", "w") + println("Warning: writting energy output to working directory") + else + rethrow() + end + finally + for energy in computed_energies + write(out_file, "$energy\n") + end + close(out_file) + end +end + +function save_psls(computed_psls, gene_name) + local out_file + try + out_file = open("../main/psls/psls_$(gene_name).txt", "w") + catch err + if isa(err, SystemError) + out_file = open("psls_$(gene_name).txt", "w") + println("Warning: writting pseudolhood output to working directory") + else + rethrow() + end + finally + for psl in computed_psls + write(out_file, "$psl\n") + end + close(out_file) + end +end + function run(gene_name, jld_file, msa_file) println("Reading msa file") in_file = open(msa_file) @@ -63,22 +101,14 @@ function run(gene_name, jld_file, msa_file) w1 = mrf["w1"] w2 = mrf["w2"] - computed_energies = compute_energies(prot_mat, w1, w2) - computed_psls = compute_psls(prot_mat, w1, w2, num_aa, num_prot) + @time computed_energies = compute_energies(prot_mat, w1, w2) + @time computed_psls = compute_psls(prot_mat, w1, w2, num_aa, num_prot) #Write outputs to main directory. println("Done. Saving energies/psls.") - out_file = open("../main/energies/energy_$(gene_name).txt", "w") - for energy in computed_energies - write(out_file, "$energy\n") - end - close(out_file) - out_file = open("../main/psls/psls_$(gene_name).txt", "w") - for psl in computed_psls - write(out_file, "$psl\n") - end - close(out_file) + save_energies(computed_energies, gene_name) + save_psls(computed_psls, gene_name) end function main()