Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 95 additions & 34 deletions gene_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
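Gene finder: locates open reading frames (ORFs) on both strands of a DNA
sequence and translates those longer than a shuffled-sequence threshold into
amino acid sequences.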

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a short description (a.k.a. a header comment) here for the next mini-project. It sounds pretty trivial, but this documentation practice is going to help you and your team members in the long term.

@author: YOUR NAME HERE
@author: Felix Eberhardt

"""

Expand All @@ -20,7 +20,7 @@ def shuffle_string(s):
# YOU WILL START YOUR IMPLEMENTATION FROM HERE DOWN ###


def get_complement(nucleotide):  # week1
    """ Returns the complementary nucleotide

        nucleotide: a nucleotide (A, C, G, or T) represented as a string
        returns: the complementary nucleotide
        raises: ValueError if nucleotide is not one of A, C, G or T
    >>> get_complement('A')
    'T'
    >>> get_complement('C')
    'G'
    >>> get_complement('T')
    'A'
    >>> get_complement('G')
    'C'
    """
    # Base-pairing table: A pairs with T, C pairs with G.
    complements = {'A': 'T', 'C': 'G', 'T': 'A', 'G': 'C'}
    if nucleotide not in complements:
        # Fail loudly here instead of returning None, which would only
        # surface later as a confusing error in the caller.
        raise ValueError(
            "invalid nucleotide %r (expected 'A', 'C', 'G' or 'T')"
            % (nucleotide,))
    return complements[nucleotide]


def get_reverse_complement(dna):  # week1
    """ Computes the reverse complementary sequence of DNA for the specified DNA
        sequence

        dna: a DNA sequence represented as a string
        returns: the reverse complementary DNA sequence represented as a string
    >>> get_reverse_complement("ATGCCCGCTTT")
    'AAAGCGGGCAT'
    >>> get_reverse_complement("CCGCGTTCA")
    'TGAACGCGG'
    """
    # Walk the strand back to front and complement each nucleotide; join()
    # assembles the result in one pass instead of growing a string
    # character by character in two separate loops.
    return ''.join(get_complement(nucleotide) for nucleotide in reversed(dna))

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Going Beyond: Try using a list comprehension, joined back into a string:

return ''.join([get_complement(c) for c in dna[::-1]])

For simple functions like this, one line of code is more readable than 10 lines of code that do the same thing.


# Codons that terminate an open reading frame (checked by rest_of_ORF)
stop_codons = ['TAA', 'TAG', 'TGA']
# Codon that opens an open reading frame (checked by find_all_ORFs_oneframe)
start_codon = 'ATG'

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's usually recommended to put these defining statements right below the import statements because these variables are going to get referenced in many other functions that come below.

def rest_of_ORF(dna):  # week1
    """ Takes a DNA sequence that is assumed to begin with a start
        codon and returns the sequence up to but not including the
        first in frame stop codon.  If there is no in frame stop codon,
        returns the whole string.

        dna: a DNA sequence
        returns: the open reading frame represented as a string
    >>> rest_of_ORF("ATGTGAA")
    'ATG'
    >>> rest_of_ORF("ATGAGATAGG")
    'ATGAGA'
    """
    # Step through the sequence one codon (3 nucleotides) at a time so that
    # only in-frame stop codons are considered.
    for i in range(0, len(dna), 3):
        if dna[i:i + 3] in stop_codons:
            return dna[:i]
    # No in-frame stop codon: the ORF runs to the end of the sequence.
    return dna

def find_all_ORFs_oneframe(dna):  # week1
    """ Finds all non-nested open reading frames in the given DNA
        sequence and returns them as a list.  This function should
        only find ORFs that are in the default frame of the sequence
        (i.e. they start on indices that are multiples of 3).
        By non-nested we mean that if an ORF occurs entirely within
        another ORF, it should not be included in the returned list of ORFs.

        dna: a DNA sequence
        returns: a list of non-nested ORFs
    >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")
    ['ATGCATGAATGTAGA', 'ATGTGCCC']
    """
    orfs = []
    position = 0
    while position < len(dna):
        if dna[position:position + 3] == start_codon:
            orf = rest_of_ORF(dna[position:])
            orfs.append(orf)
            # Jump past the ORF just found so start codons nested inside it
            # are not reported; this lands on its stop codon (or at the end
            # of the sequence when there was none).
            position += len(orf)
        else:
            # Not a start codon: advance one codon, staying in frame.
            position += 3
    return orfs

def find_all_ORFs(dna):  # week1
    """ Finds all non-nested open reading frames in the given DNA sequence in
        all 3 possible frames and returns them as a list.  By non-nested we
        mean that if an ORF occurs entirely within another ORF and they are
        both in the same frame, it should not be included in the returned list
        of ORFs.

        dna: a DNA sequence
        returns: a list of non-nested ORFs
    >>> find_all_ORFs("ATGCATGAATGTAG")
    ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
    """
    all_orfs = []
    # Dropping 0, 1 or 2 leading nucleotides shifts the sequence into each of
    # the three reading frames; results are concatenated in frame order.
    for offset in range(3):
        all_orfs.extend(find_all_ORFs_oneframe(dna[offset:]))
    return all_orfs

def find_all_ORFs_both_strands(dna):  # week1
    """ Finds all non-nested open reading frames in the given DNA sequence on both
        strands.

        dna: a DNA sequence
        returns: a list of non-nested ORFs
    >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
    ['ATGCGAATG', 'ATGCTACATTCGCAT']
    """
    # ORFs of the given strand come first, followed by those found on its
    # reverse complement.
    forward_orfs = find_all_ORFs(dna)
    reverse_orfs = find_all_ORFs(get_reverse_complement(dna))
    return forward_orfs + reverse_orfs

##### week 2 ######

def longest_ORF(dna):
    """ Finds the longest ORF on both strands of the specified DNA and returns it
        as a string.  If the sequence contains no ORF at all, returns the
        empty string.
    >>> longest_ORF("ATGCGAATGTAGCATCAAA")
    'ATGCTACATTCGCAT'
    """
    orfs = find_all_ORFs_both_strands(dna)
    if not orfs:
        # max() raises ValueError on an empty list; a sequence without any
        # start codon (common after shuffling) must not crash the caller.
        return ''
    return max(orfs, key=len)

def longest_ORF_noncoding(dna, num_trials):
    """ Computes the maximum length of the longest ORF over num_trials shuffles
        of the specified DNA sequence

        dna: a DNA sequence
        num_trials: the number of random shuffles
        returns: the maximum length longest ORF (0 if num_trials is 0) """
    max_length = 0
    for _ in range(num_trials):
        # Shuffle the sequence and measure the longest ORF it contains.
        shuffled = shuffle_string(dna)
        # Only the running maximum is needed, so nothing is stored per trial.
        max_length = max(max_length, len(longest_ORF(shuffled)))
    return max_length



def coding_strand_to_AA(dna):
    """ Computes the protein encoded by a sequence of DNA by translating it
        one codon (3 nucleotides) at a time, starting at index 0.  Chunks
        that match no codon group contribute nothing to the result.

        dna: a DNA sequence represented as a string
        returns: a string containing the sequence of amino acids encoded by
                 the input DNA fragment
    >>> coding_strand_to_AA("ATGCCCGCTTT")
    'MPA'
    """
    # NOTE(review): assumes `codons` and `aa` are parallel sequences, where
    # codons[i] holds the codons that encode amino acid aa[i] -- confirm
    # against the module they are imported from.
    residues = []
    for start in range(0, len(dna), 3):
        codon = dna[start:start + 3]
        for codon_group, amino_acid in zip(codons, aa):
            if codon in codon_group:
                residues.append(amino_acid)
    return ''.join(residues)


def gene_finder(dna, num_trials=1500):
    """ Returns the amino acid sequences that are likely coded by the specified dna

        dna: a DNA sequence
        num_trials: number of random shuffles used to compute the length
                    threshold (defaults to 1500)
        returns: a list of all amino acid sequences coded by the sequence dna.
    """
    # ORFs no longer than the longest ORF found in shuffled copies of the
    # sequence are treated as chance occurrences and discarded.
    threshold = longest_ORF_noncoding(dna, num_trials)
    open_reading_frames = find_all_ORFs_both_strands(dna)
    return [coding_strand_to_AA(orf)
            for orf in open_reading_frames
            if len(orf) > threshold]

if __name__ == "__main__":
    import doctest
    # Run the docstring examples first; pass verbose=True to testmod() for a
    # full report while debugging.
    doctest.testmod()
    # Load the DNA sequence and print the candidate genes found in it.
    from load import load_seq
    dna = load_seq("./data/X73525.fa")
    print(gene_finder(dna))