Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 136 additions & 8 deletions gene_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import random
from amino_acids import aa, codons, aa_table # you may find these useful
from load import load_seq
dna = load_seq("./data/X73525.fa")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While the logic here is similar for both import and load_seq, the former is a module import statement while the latter is a function call. It is conventional to keep import statements separate from the rest of the code. In this case, load_seq should be in __main__



def shuffle_string(s):
Expand All @@ -29,7 +30,25 @@ def get_complement(nucleotide):
'T'
>>> get_complement('C')
'G'
>>> get_complement('G')
'C'
>>> get_complement('T')
'A'

"""
complement_strand = ""
for pair in nucleotide:
if pair == 'A':
complement_strand = complement_strand + 'T'
if pair == 'C':
complement_strand = complement_strand + 'G'
if pair == 'T':
complement_strand = complement_strand + 'A'
if pair == 'G':
complement_strand = complement_strand + 'C'
#print (complement_strand)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please remove comments for the final code

return(complement_strand)

# TODO: implement this
pass

Expand All @@ -45,11 +64,18 @@ def get_reverse_complement(dna):
>>> get_reverse_complement("CCGCGTTCA")
'TGAACGCGG'
"""
rev_complement =""
complement = get_complement(dna)
strand_length = len(complement)
while (strand_length > 0): #starts at the back of the strand
rev_complement = rev_complement + complement[strand_length-1]
strand_length -= 1 #moves backwards
return (rev_complement)
# TODO: implement this
pass

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are simple placeholders, so no need to leave them in your final code



def rest_of_ORF(dna):
def rest_of_ORF(dna): # need to get this checked
""" Takes a DNA sequence that is assumed to begin with a start
codon and returns the sequence up to but not including the
first in frame stop codon. If there is no in frame stop codon,
Expand All @@ -61,8 +87,25 @@ def rest_of_ORF(dna):
'ATG'
>>> rest_of_ORF("ATGAGATAGG")
'ATGAGA'

>>> rest_of_ORF("TAG")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 to custom unit testing!

''

"""
#stop codons are TAA, TAG, TGA
# TODO: implement this
#stop_codons = ['TAA','TAG','TGA']

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove comments please

stop_codons = ['TAA','TAG','TGA']

for i in range (0, len(dna)-1,3): # move though the strand every three bacepairs

if ( len(dna) - i >=3): # only check for stop if there are three or more bace pairs left
if dna[i:i+3] in stop_codons: #stop if you find a stop codon

return dna[0:i] #return dna before the codon

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like your inline comments. But you don't need to write them for every line if you feel like you are spending too much time explaining trivial things into words.


return dna
pass


Expand All @@ -78,10 +121,27 @@ def find_all_ORFs_oneframe(dna):
returns: a list of non-nested ORFs
>>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")
['ATGCATGAATGTAGA', 'ATGTGCCC']
>>> find_all_ORFs_oneframe("ATGATGCATGAATGTAGATAGATGTGCCC")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 another good unit test

['ATGATGCATGAATGTAGA', 'ATGTGCCC']
"""
# TODO: implement this
pass

all_pairs = len(dna) #records the length of the total DNA strand
total_checked = 0 #the number of strands checked.
orfs = [] #list of found orfs

i =0 #The number of pairs that have been checked


while i <= all_pairs: #while loop goes through the length of the strand
stran = ""
if dna[i:i+3] == "ATG": #if a start codon is found
stran = "ATG" # records the start codon
#print (dna[i:])
i = i + 3 #skipps the start codon
stran = stran + rest_of_ORF(dna[i:]) #add the rest of the orf to the start codon
orfs.append(stran) #saves the found orf
i = i + 3 #skips stop codon
i = i + 3 #moves to next framm
return orfs #resturns found orfs

def find_all_ORFs(dna):
""" Finds all non-nested open reading frames in the given DNA sequence in
Expand All @@ -95,7 +155,14 @@ def find_all_ORFs(dna):

>>> find_all_ORFs("ATGCATGAATGTAG")
['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']

"""
orf = []
stran1 = find_all_ORFs_oneframe(dna) #finds orfs in frame 1
orf = (stran1)
orf = orf + (find_all_ORFs_oneframe(dna[1:])) #finds orfs in frame 2
orf = orf +(find_all_ORFs_oneframe(dna[2:])) #finds orfs in frame 3
return orf #returns found codon
# TODO: implement this
pass

Expand All @@ -109,6 +176,11 @@ def find_all_ORFs_both_strands(dna):
>>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
['ATGCGAATG', 'ATGCTACATTCGCAT']
"""

orf = []
orf = orf + find_all_ORFs(dna) #finds orfs all refrence frames
orf = orf + find_all_ORFs(get_reverse_complement(dna)) #finds orf on the other side of the stran
return orf
# TODO: implement this
pass

Expand All @@ -119,6 +191,7 @@ def longest_ORF(dna):
>>> longest_ORF("ATGCGAATGTAGCATCAAA")
'ATGCTACATTCGCAT'
"""
return max(find_all_ORFs_both_strands(dna), key=len); #returns longest orf

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice concise & efficient code

# TODO: implement this
pass

Expand All @@ -129,7 +202,19 @@ def longest_ORF_noncoding(dna, num_trials):

dna: a DNA sequence
num_trials: the number of random shuffles
returns: the maximum length longest ORF """

"""
max_orf = ""
temp = ""
for i in range(0, num_trials):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

one stylistic convention. for _ in range(0, num_trials):

random_dna = shuffle_string(dna) #suffles DNA
temp = longest_ORF(random_dna) # finds the longest orf in the shuffled dna
if len(temp) > len(max_orf): #if the current orf is longer than the previose longest
max_orf = str(temp) #make the current orf the longest



return len(max_orf) #return the longest orf of all the orfs
# TODO: implement this
pass

Expand All @@ -147,7 +232,29 @@ def coding_strand_to_AA(dna):
'MR'
>>> coding_strand_to_AA("ATGCCCGCTTT")
'MPA'




"""
# >>> coding_strand_to_AA("ATGCGAATGTAGCATCAAA")
# 'MPA'

ans = ''
# for codon in codons:
# for c in codon:
# for i in range(0,len(dna),3):
#
# if dna[i:i+3] == c:
# ans = ans + aa[codons.index(codon)]
#
#
# return ans
for i in range(0,len(dna),3): #amino acids are three base pairs long
if len(dna[i:i+3]) < 3: #read as long as there are atleast 3 bace pairs left
continue
ans = ans + aa_table[dna[i:i+3]] #reads three bace pairs and matches it to an amino acid
return #return amino acid
# TODO: implement this
pass

Expand All @@ -158,9 +265,30 @@ def gene_finder(dna):
dna: a DNA sequence
returns: a list of all amino acid sequences coded by the sequence dna.
"""
# TODO: implement this
pass
found_dna = []
long_acids = []
threshold = longest_ORF_noncoding(dna, 1500) # longest random orf after 1500 samples
# print(threshold)

#print(dna)
#print(longest_ORF_noncoding(dna,1000)) the average length is 332 110 amino acids
found_dna = find_all_ORFs_both_strands(dna) #finds all of the orfs in the dna
# print(found_dna)
found_acids = []
for c in found_dna:
found_acids.append(coding_strand_to_AA(c)) #translates bacepairs into amino acids
for c in found_acids:
if len(c) >= threshold/3: #threshold was measured in bace pairs there are 3 base pairs per amino acid
long_acids.append(c) #list of amino acid orfs that are longer than the threshold
print("threshold: " + str(threshold))
print ("Found Acids: ")

return long_acids



if __name__ == "__main__":
import doctest
doctest.testmod()
# print(len(dna))

gene_finder(dna)