diff --git a/gene_finder.py b/gene_finder.py index 3b1e7dd..3a39509 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -9,6 +9,7 @@ import random from amino_acids import aa, codons, aa_table # you may find these useful from load import load_seq +dna = load_seq("./data/X73525.fa") def shuffle_string(s): @@ -29,7 +30,25 @@ def get_complement(nucleotide): 'T' >>> get_complement('C') 'G' + >>> get_complement('G') + 'C' + >>> get_complement('T') + 'A' + """ + complement_strand = "" + for pair in nucleotide: + if pair == 'A': + complement_strand = complement_strand + 'T' + if pair == 'C': + complement_strand = complement_strand + 'G' + if pair == 'T': + complement_strand = complement_strand + 'A' + if pair == 'G': + complement_strand = complement_strand + 'C' + #print (complement_strand) + return(complement_strand) + # TODO: implement this pass @@ -45,11 +64,18 @@ def get_reverse_complement(dna): >>> get_reverse_complement("CCGCGTTCA") 'TGAACGCGG' """ + rev_complement ="" + complement = get_complement(dna) + strand_length = len(complement) + while (strand_length > 0): #starts at the back of the strand + rev_complement = rev_complement + complement[strand_length-1] + strand_length -= 1 #moves backwards + return (rev_complement) # TODO: implement this pass -def rest_of_ORF(dna): +def rest_of_ORF(dna): # need to get this checked """ Takes a DNA sequence that is assumed to begin with a start codon and returns the sequence up to but not including the first in frame stop codon. If there is no in frame stop codon, @@ -61,8 +87,25 @@ def rest_of_ORF(dna): 'ATG' >>> rest_of_ORF("ATGAGATAGG") 'ATGAGA' + + >>> rest_of_ORF("TAG") + '' + """ + #stop codons are TAA, TAG, TGA # TODO: implement this + #stop_codons = ['TAA','TAG','TGA'] + stop_codons = ['TAA','TAG','TGA'] + + for i in range (0, len(dna)-1,3): # move though the strand every three bacepairs + + if ( len(dna) - i >=3): # only check for stop if there are three or more bace pairs left + if dna[i:i+3] in stop_codons: #stop if you find a stop codon + + return dna[0:i] #return dna before the codon + + + return dna pass @@ -78,10 +121,27 @@ def find_all_ORFs_oneframe(dna): returns: a list of non-nested ORFs >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC") ['ATGCATGAATGTAGA', 'ATGTGCCC'] + >>> find_all_ORFs_oneframe("ATGATGCATGAATGTAGATAGATGTGCCC") + ['ATGATGCATGAATGTAGA', 'ATGTGCCC'] """ - # TODO: implement this - pass - + all_pairs = len(dna) #records the length of the total DNA strand + total_checked = 0 #the number of strands checked. + orfs = [] #list of found orfs + + i =0 #The number of pairs that have been checked + + + while i <= all_pairs: #while loop goes through the length of the strand + stran = "" + if dna[i:i+3] == "ATG": #if a start codon is found + stran = "ATG" # records the start codon + #print (dna[i:]) + i = i + 3 #skipps the start codon + stran = stran + rest_of_ORF(dna[i:]) #add the rest of the orf to the start codon + orfs.append(stran) #saves the found orf + i = i + 3 #skips stop codon + i = i + 3 #moves to next framm + return orfs #resturns found orfs def find_all_ORFs(dna): """ Finds all non-nested open reading frames in the given DNA sequence in @@ -95,7 +155,14 @@ def find_all_ORFs(dna): >>> find_all_ORFs("ATGCATGAATGTAG") ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG'] + """ + orf = [] + stran1 = find_all_ORFs_oneframe(dna) #finds orfs in frame 1 + orf = (stran1) + orf = orf + (find_all_ORFs_oneframe(dna[1:])) #finds orfs in frame 2 + orf = orf +(find_all_ORFs_oneframe(dna[2:])) #finds orfs in frame 3 + return orf #returns found codon # TODO: implement this pass @@ -109,6 +176,11 @@ def find_all_ORFs_both_strands(dna): >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA") ['ATGCGAATG', 'ATGCTACATTCGCAT'] """ + + orf = [] + orf = orf + find_all_ORFs(dna) #finds orfs all refrence frames + orf = orf + find_all_ORFs(get_reverse_complement(dna)) #finds orf on the other side of the stran + return orf # TODO: implement this pass @@ -119,6 +191,7 @@ def longest_ORF(dna): >>> longest_ORF("ATGCGAATGTAGCATCAAA") 'ATGCTACATTCGCAT' """ + return max(find_all_ORFs_both_strands(dna), key=len); #returns longest orf # TODO: implement this pass @@ -129,7 +202,19 @@ def longest_ORF_noncoding(dna, num_trials): dna: a DNA sequence num_trials: the number of random shuffles - returns: the maximum length longest ORF """ + + """ + max_orf = "" + temp = "" + for i in range(0, num_trials): + random_dna = shuffle_string(dna) #suffles DNA + temp = longest_ORF(random_dna) # finds the longest orf in the shuffled dna + if len(temp) > len(max_orf): #if the current orf is longer than the previose longest + max_orf = str(temp) #make the current orf the longest + + + + return len(max_orf) #return the longest orf of all the orfs # TODO: implement this pass @@ -147,7 +232,29 @@ def coding_strand_to_AA(dna): 'MR' >>> coding_strand_to_AA("ATGCCCGCTTT") 'MPA' + + + + """ + # >>> coding_strand_to_AA("ATGCGAATGTAGCATCAAA") + # 'MPA' + + ans = '' + # for codon in codons: + # for c in codon: + # for i in range(0,len(dna),3): + # + # if dna[i:i+3] == c: + # ans = ans + aa[codons.index(codon)] + # + # + # return ans + for i in range(0,len(dna),3): #amino acids are three base pairs long + if len(dna[i:i+3]) < 3: #read as long as there are atleast 3 bace pairs left + continue + ans = ans + aa_table[dna[i:i+3]] #reads three bace pairs and matches it to an amino acid + return #return amino acid # TODO: implement this pass @@ -158,9 +265,30 @@ def gene_finder(dna): dna: a DNA sequence returns: a list of all amino acid sequences coded by the sequence dna. """ - # TODO: implement this - pass + found_dna = [] + long_acids = [] + threshold = longest_ORF_noncoding(dna, 1500) # longest random orf after 1500 samples + # print(threshold) + + #print(dna) + #print(longest_ORF_noncoding(dna,1000)) the average length is 332 110 amino acids + found_dna = find_all_ORFs_both_strands(dna) #finds all of the orfs in the dna + # print(found_dna) + found_acids = [] + for c in found_dna: + found_acids.append(coding_strand_to_AA(c)) #translates bacepairs into amino acids + for c in found_acids: + if len(c) >= threshold/3: #threshold was measured in bace pairs there are 3 base pairs per amino acid + long_acids.append(c) #list of amino acid orfs that are longer than the threshold + print("threshold: " + str(threshold)) + print ("Found Acids: ") + + return long_acids + + if __name__ == "__main__": import doctest - doctest.testmod() + # print(len(dna)) + + gene_finder(dna)