Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added Selection_006.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added __pycache__/amino_acids.pypy-26.pyc
Binary file not shown.
Binary file added __pycache__/load.pypy-26.pyc
Binary file not shown.
105 changes: 14 additions & 91 deletions gene_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def shuffle_string(s):

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Overall - good implementation of dictionaries and list comprehensions, and general cleaning up.

# YOU WILL START YOUR IMPLEMENTATION FROM HERE DOWN ###

complement_dict = {'A':'T', 'T':'A', 'C':'G', 'G':'C'}

def get_complement(nucleotide):
""" Returns the complementary nucleotide
Expand All @@ -32,19 +33,7 @@ def get_complement(nucleotide):
>>> get_complement('G')
'C'
"""

#series of if elif statements that return complementary nucleotides
if nucleotide == 'A':
return 'T'
elif nucleotide == 'T':
return 'A'
elif nucleotide == 'C':
return 'G'
elif nucleotide == 'G':
return 'C'
else:
print "Invalid Arguemnt not a nucleotide"
return "A"
return complement_dict[nucleotide]

def get_reverse_complement(dna):
""" Computes the reverse complementary sequence of DNA for the specfied DNA
Expand All @@ -58,12 +47,7 @@ def get_reverse_complement(dna):
'TGAACGCGG'
"""

new_string = ""

for i in dna[::-1]: #iterate from the last index to the first index
new_string = new_string + get_complement(i)

return new_string
return ''.join(get_complement(n) for n in dna[::-1])
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

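Effective use of a generator expression with `str.join` (strictly speaking this is a generator expression rather than a list comprehension, since there are no square brackets).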


def rest_of_ORF(dna):
""" Takes a DNA sequence that is assumed to begin with a start
Expand All @@ -90,7 +74,7 @@ def rest_of_ORF(dna):
if dna[i:i+3] in ['TAA','TAG','TGA']:
break
#add dna triple to new_string
new_string= new_string + dna[i:i+3]
new_string = new_string + dna[i:i+3]

#increase counter by 3 because we added triple
i = i + 3
Expand Down Expand Up @@ -138,7 +122,6 @@ def find_all_ORFs(dna):
>>> find_all_ORFs("ATGCATGAATGTAG")
['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
"""

dna_list = []
for i in range(3): #go through each possible frames
dna_list = dna_list + find_all_ORFs_oneframe(dna[i:])
Expand All @@ -159,9 +142,8 @@ def find_all_ORFs_both_strands(dna):
dna_list = [] #add orf's to list then return


for list in string_list:
temp_list = find_all_ORFs(list)
for item in temp_list:
for temp in string_list:
for item in find_all_ORFs(temp):
dna_list.append(item)

return dna_list
Expand All @@ -187,14 +169,9 @@ def longest_ORF_noncoding(dna, num_trials):
dna: a DNA sequence
num_trials: the number of random shuffles
returns: the maximum length longest ORF """
maximum_length = 0 #maximum length will store the strand with the maximum length
string = "" #this is the string that we will return
for i in range(num_trials): #run program num_trials times
shuffled = longest_ORF(shuffle_string(dna)) #get the longest string frum shuffled input
if len(shuffled) > maximum_length: #compare length of string to previous maximum
maximum_length = len(shuffled) #assign new maximum if true
string = shuffled
return string

return max(len(longest_ORF(shuffle_string(dna))) for test in xrange(num_trials))


def coding_strand_to_AA(dna):
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

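This is miles better than the mess of ifs and elses you had before. One caveat: strings are immutable and `str.join` returns a new string, so `protein.join(...)` on its own line discards the result and the function returns `""` — assign or return the joined value directly, e.g. `return ''.join(aa_table[dna[i:i+3]] for i in xrange(0, len(dna)-2, 3))`.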

""" Computes the Protein encoded by a sequence of DNA. This function
Expand All @@ -210,63 +187,9 @@ def coding_strand_to_AA(dna):
>>> coding_strand_to_AA("ATGCCCGCTTT")
'MPA'
"""
i = 0
protein = "" #protein stores each amino value for triples
amino = "" #a value which we append to protein, will change depending on triple catigorization
for i in range(0,len(dna),3): #we step 3 because we want triples
triple = dna[i:i+3] #get the next three codons from dna

if len (triple) == 3: #make sure triple has length 3, appending occures at the end of this statement
#lots of if elif statements to categorize dna to amino acid
#each will assign a new value to amino
#yes I do know about dictionaries
if triple in ["TTT","TTC"]:
amino = "F"
elif triple in ['TTA','TTG','CTT','CTC','CTA','CTG']:
amino = "L"
elif triple in ['ATT','ATC','ATA']:
amino = "I"
elif triple in ["ATG"]:
amino = "M"
elif triple in ['GTT','GTC','GTA','GTG']:
amino = "V"
elif triple in ['TCT','TCC','TCA','TCG']:
amino = "S"
elif triple in ['CCT','CCC','CCA','CCG']:
amino = "P"
elif triple in ['ACT','ACC','ACA','ACG']:
amino = "T"
elif triple in ['GCT','GCC','GCA','GCG']:
amino = "A"
elif triple in ['TAT','TAC']:
amino = "Y"
elif triple in ['CAT','CAC']:
amino = "H"
elif triple in ['CAA','CAG']:
amino = "Q"
elif triple in ['AAT','AAC']:
amino = "N"
elif triple in ['AAA','AAG']:
amino = "K"
elif triple in ['GAT','GAC']:
amino = "D"
elif triple in ['GAA','GAG']:
amino = "E"
elif triple in ['TGT','TGC']:
amino = "C"
elif triple in ['TGG']:
amino = "W"
elif triple in ['CGT','CGC','CGA','CGG']:
amino = "R"
elif triple in ['AGT','AGC']:
amino = "S"
elif triple in ['AGT','AGC']:
amino = "R"
elif triple in ['GGT','GGC','GGA','GGG']:
amino = "G"
# end categorization
protein = protein + amino #append amino value to protein
return protein #return string
protein = ""
protein.join(aa_table[dna[i:i+3]] for i in xrange(0, len(dna)-2, 3))
return protein

def gene_finder(dna):
""" Returns the amino acid sequences that are likely coded by the specified dna
Expand All @@ -278,7 +201,7 @@ def gene_finder(dna):
threshold = longest_ORF_noncoding(dna,1500)
all_ORFs = find_all_ORFs_both_strands(dna)
for item in all_ORFs:
if len(item) > len(threshold):
if len(item) > threshold:
AA_list.append(coding_strand_to_AA(item))
return AA_list

Expand All @@ -287,7 +210,7 @@ def gene_finder(dna):
contigs = load_contigs()
name,dna = contigs[5]

print gene_finder(dna)
gene_finder(dna)
print("---%s seconds ---" % (time.time()-start_time))

if __name__ == "__main__":
Expand Down