# remove_extragene.py

#!/usr/bin/env python3
import os
import numpy as np
from tqdm import tqdm
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

## This script removes the extra copy of genomic positions 266 to 13483 that appears in the SARS-CoV-2
## xmfa files (the region appears twice in the NCBI GFF because of the structure of ORF1ab).

# Header position field that marks the records to drop when --gene-pos is
# not given on the command line.
_DEFAULT_GENE_POS = "266+13483"


def _header_gene_pos(header):
    """Return the second space-separated field of a ``>`` header line.

    Trailing whitespace (including the newline) is stripped.  ``None`` is
    returned when the header has no second field, so such a record can
    never match the tag being removed.
    """
    fields = header.split(" ")
    if len(fields) < 2:
        return None
    return fields[1].rstrip()


def _split_records(lines):
    """Split *lines* into ``(preamble, records)``.

    ``preamble`` is the list of lines that precede the first header.  Each
    record is a list whose first element is a line starting with ``>`` and
    whose remaining elements are the lines up to the next header.
    """
    preamble = []
    records = []
    for line in lines:
        if line.startswith(">"):
            records.append([line])
        elif records:
            records[-1].append(line)
        else:
            preamble.append(line)
    return preamble, records


def main(argv=None):
    """Rewrite an xmfa file in place without the records tagged for removal.

    A record is a ``>`` header line plus every line up to the next header.
    A record is dropped when the second space-separated field of its header
    equals ``--gene-pos`` (``266+13483`` by default); every other record is
    written back unchanged and in its original order.  Lines that precede
    the first header are kept as they are.

    Args:
        argv: Optional list of command-line arguments.  When ``None``,
            argparse falls back to ``sys.argv[1:]``.
    """
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description="remove every record whose header position field matches "
                    "--gene-pos from an xmfa file (the file is rewritten in place)")
    # wrkdir is still accepted so existing invocations keep working, but
    # nothing below reads it: the xmfa file is overwritten where it lives.
    parser.add_argument("wrkdir", type=str, help="working directory")
    parser.add_argument("xmfa_file", type=str, help="path to xmfa file generated by CollectGeneAlignments")
    parser.add_argument("--gene-pos", type=str, default=_DEFAULT_GENE_POS,
                        help="second header field of the records to remove")
    opts = parser.parse_args(argv)
    xmfa_path = opts.xmfa_file
    unwanted = opts.gene_pos

    # Read everything up front: the same path is reopened for writing below.
    with open(xmfa_path, "r") as source:
        lines = source.readlines()

    preamble, records = _split_records(lines)

    with open(xmfa_path, "w") as out:
        out.writelines(preamble)
        for record in tqdm(records):
            if _header_gene_pos(record[0]) == unwanted:
                continue
            out.writelines(record)
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()