-
Notifications
You must be signed in to change notification settings - Fork 18
Completed mp1 part2 #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,7 +2,7 @@ | |
| """ | ||
| YOUR HEADER COMMENT HERE | ||
|
|
||
| @author: YOUR NAME HERE | ||
| @author: Felix Eberhardt | ||
|
|
||
| """ | ||
|
|
||
|
|
@@ -20,7 +20,7 @@ def shuffle_string(s): | |
| # YOU WILL START YOUR IMPLEMENTATION FROM HERE DOWN ### | ||
|
|
||
|
|
||
| def get_complement(nucleotide): | ||
| def get_complement(nucleotide): #week1 | ||
| """ Returns the complementary nucleotide | ||
|
|
||
| nucleotide: a nucleotide (A, C, G, or T) represented as a string | ||
|
|
@@ -30,11 +30,17 @@ def get_complement(nucleotide): | |
| >>> get_complement('C') | ||
| 'G' | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| if nucleotide == 'A': | ||
| return 'T' | ||
| elif nucleotide == 'C': | ||
| return 'G' | ||
| elif nucleotide == 'T': | ||
| return 'A' | ||
| elif nucleotide == 'G': | ||
| return 'C' | ||
|
|
||
|
|
||
| def get_reverse_complement(dna): | ||
| def get_reverse_complement(dna): #week1 | ||
| """ Computes the reverse complementary sequence of DNA for the specfied DNA | ||
| sequence | ||
|
|
||
|
|
@@ -45,11 +51,25 @@ def get_reverse_complement(dna): | |
| >>> get_reverse_complement("CCGCGTTCA") | ||
| 'TGAACGCGG' | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
|
|
||
| def rest_of_ORF(dna): | ||
| # Step 1 get complement | ||
| counter_1=0 | ||
| complement_dna = '' | ||
| while counter_1 < len(dna): | ||
| complement_dna = complement_dna + get_complement(dna[counter_1]) | ||
| counter_1 = counter_1 + 1 | ||
| # Step 2 reverse it | ||
| reverse_complement = '' | ||
| counter_2=len(dna)-1 | ||
| while counter_2 >= 0: | ||
| reverse_complement = reverse_complement + complement_dna[counter_2] | ||
| counter_2 = counter_2 - 1 | ||
| return reverse_complement | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Going Beyond: Try using list comprehensions.
For simple functions like this, one line of code (e.g. `return ''.join([get_complement(n) for n in reversed(dna)])`) is more readable than 10 lines of code that do the same thing. |
||
|
|
||
| # Define Stop Codons | ||
| stop_codons = ['TAA', 'TAG', 'TGA'] | ||
| start_codon = 'ATG' | ||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's usually recommended to put constant definitions like these right below the import statements, because these variables are referenced by many of the functions that follow. |
||
| def rest_of_ORF(dna): #week1 | ||
| """ Takes a DNA sequence that is assumed to begin with a start | ||
| codon and returns the sequence up to but not including the | ||
| first in frame stop codon. If there is no in frame stop codon, | ||
|
|
@@ -62,11 +82,13 @@ def rest_of_ORF(dna): | |
| >>> rest_of_ORF("ATGAGATAGG") | ||
| 'ATGAGA' | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
| i=0 | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This statement (`i=0`) is unnecessary: the `for` loop that follows assigns `i` itself. |
||
| for i in range(0,len(dna), 3): | ||
| if dna[i:i+3] in stop_codons: | ||
| return dna[:i] | ||
| return dna | ||
|
|
||
| def find_all_ORFs_oneframe(dna): | ||
| def find_all_ORFs_oneframe(dna): #week1 | ||
| """ Finds all non-nested open reading frames in the given DNA | ||
| sequence and returns them as a list. This function should | ||
| only find ORFs that are in the default frame of the sequence | ||
|
|
@@ -79,11 +101,19 @@ def find_all_ORFs_oneframe(dna): | |
| >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC") | ||
| ['ATGCATGAATGTAGA', 'ATGTGCCC'] | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
|
|
||
| def find_all_ORFs(dna): | ||
| count=0 | ||
| all_ORFs_oneframe = [] #empty list | ||
| while count < len(dna): | ||
| if dna[count:count+3] == start_codon: | ||
| orf = rest_of_ORF(dna[count:]) | ||
| all_ORFs_oneframe.append(orf) | ||
| count = count + len(orf) | ||
| else : | ||
| count = count + 3 | ||
| return all_ORFs_oneframe | ||
|
|
||
| def find_all_ORFs(dna): #week1 | ||
| """ Finds all non-nested open reading frames in the given DNA sequence in | ||
| all 3 possible frames and returns them as a list. By non-nested we | ||
| mean that if an ORF occurs entirely within another ORF and they are | ||
|
|
@@ -96,11 +126,10 @@ def find_all_ORFs(dna): | |
| >>> find_all_ORFs("ATGCATGAATGTAG") | ||
| ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG'] | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
| all_ORFs = find_all_ORFs_oneframe(dna) + find_all_ORFs_oneframe(dna[1:]) + find_all_ORFs_oneframe(dna[2:]) | ||
| return all_ORFs | ||
|
|
||
| def find_all_ORFs_both_strands(dna): | ||
| def find_all_ORFs_both_strands(dna): #week1 | ||
| """ Finds all non-nested open reading frames in the given DNA sequence on both | ||
| strands. | ||
|
|
||
|
|
@@ -109,19 +138,22 @@ def find_all_ORFs_both_strands(dna): | |
| >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA") | ||
| ['ATGCGAATG', 'ATGCTACATTCGCAT'] | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| # create empty lists | ||
| all_ORFs_both_strands = [] | ||
| # search in them and add lists together | ||
| all_ORFs_both_strands = find_all_ORFs(dna) + find_all_ORFs(get_reverse_complement(dna)) | ||
| return all_ORFs_both_strands | ||
|
|
||
| ##### week 2 ###### | ||
|
|
||
| def longest_ORF(dna): | ||
| """ Finds the longest ORF on both strands of the specified DNA and returns it | ||
| as a string | ||
| >>> longest_ORF("ATGCGAATGTAGCATCAAA") | ||
| 'ATGCTACATTCGCAT' | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
| longest_ORF = max(find_all_ORFs_both_strands(dna), key=len) | ||
| return longest_ORF | ||
|
|
||
| def longest_ORF_noncoding(dna, num_trials): | ||
| """ Computes the maximum length of the longest ORF over num_trials shuffles | ||
|
|
@@ -130,8 +162,18 @@ def longest_ORF_noncoding(dna, num_trials): | |
| dna: a DNA sequence | ||
| num_trials: the number of random shuffles | ||
| returns: the maximum length longest ORF """ | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
| t=0 | ||
| ORF_noncoding = [] | ||
| while t < num_trials: # loop it for num_trials times | ||
| a = shuffle_string(dna) # use shuffling function to create new dna | ||
| b = longest_ORF(a) # put it in longest_ORF | ||
| ORF_noncoding.append(b) # return len as integer to list | ||
| t = t + 1 | ||
| longest_ORF_noncoding = max(ORF_noncoding, key=len)# Look for and return the longest ORF (same function as used above) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Something to think about: can you do this without saving each ORF in a list? |
||
| max_length = len(longest_ORF_noncoding) | ||
| return max_length | ||
|
|
||
|
|
||
|
|
||
| def coding_strand_to_AA(dna): | ||
|
|
@@ -148,8 +190,14 @@ def coding_strand_to_AA(dna): | |
| >>> coding_strand_to_AA("ATGCCCGCTTT") | ||
| 'MPA' | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
| strand_to_AA = '' # empty string to append | ||
| for n in range(0, len(dna), 3): #go through dna string | ||
| for i in range(0, len(codons)): #go through inner lists | ||
| if dna[n:n+3] in codons[i]: #search for dna in each inn | ||
| amino = aa[i] # convert each triplet into the linked letter | ||
| strand_to_AA += amino # append it to the amino string | ||
| return strand_to_AA #return the string of aminos | ||
|
|
||
|
|
||
| def gene_finder(dna): | ||
|
|
@@ -158,9 +206,22 @@ def gene_finder(dna): | |
| dna: a DNA sequence | ||
| returns: a list of all amino acid sequences coded by the sequence dna. | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| threshold = longest_ORF_noncoding(dna, 1500) # sets the treshold | ||
| open_reading_frames = find_all_ORFs_both_strands(dna) # find all open reading frames on both strands | ||
| i = 0 | ||
| aa_sequence = [] | ||
| for i in range(len(open_reading_frames)): | ||
| if len(open_reading_frames[i]) > threshold: | ||
| aminos = coding_strand_to_AA(open_reading_frames[i]) #return the amino_acids | ||
| aa_sequence.append(aminos) #add it to the list | ||
| i += 1 | ||
| return aa_sequence ## return the list containing the amino acid sequence encoded longer than treshold | ||
|
|
||
| if __name__ == "__main__": | ||
| import doctest | ||
| doctest.testmod() | ||
| # Importing dna | ||
| from load import load_seq | ||
| dna = load_seq("./data/X73525.fa") | ||
| print(gene_finder(dna)) #execute function | ||
| # doctest.testmod(verbose=True) | ||
| #doctest.run_docstring_examples(find_all_ORFs, globals(), verbose=True) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add a short description (a.k.a. a header comment) here for the next mini-project. It sounds pretty trivial, but this kind of documentation practice is going to help you and your team members in the long term.