-
Notifications
You must be signed in to change notification settings - Fork 18
Harrison Young Gene Finder Project #11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ | |
| import random | ||
| from amino_acids import aa, codons, aa_table # you may find these useful | ||
| from load import load_seq | ||
| dna = load_seq("./data/X73525.fa") | ||
|
|
||
|
|
||
| def shuffle_string(s): | ||
|
|
@@ -29,7 +30,25 @@ def get_complement(nucleotide): | |
| 'T' | ||
| >>> get_complement('C') | ||
| 'G' | ||
| >>> get_complement('G') | ||
| 'C' | ||
| >>> get_complement('T') | ||
| 'A' | ||
|
|
||
| """ | ||
| complement_strand = "" | ||
| for pair in nucleotide: | ||
| if pair == 'A': | ||
| complement_strand = complement_strand + 'T' | ||
| if pair == 'C': | ||
| complement_strand = complement_strand + 'G' | ||
| if pair == 'T': | ||
| complement_strand = complement_strand + 'A' | ||
| if pair == 'G': | ||
| complement_strand = complement_strand + 'C' | ||
| #print (complement_strand) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please remove comments for the final code |
||
| return(complement_strand) | ||
|
|
||
| # TODO: implement this | ||
| pass | ||
|
|
||
|
|
@@ -45,11 +64,18 @@ def get_reverse_complement(dna): | |
| >>> get_reverse_complement("CCGCGTTCA") | ||
| 'TGAACGCGG' | ||
| """ | ||
| rev_complement ="" | ||
| complement = get_complement(dna) | ||
| strand_length = len(complement) | ||
| while (strand_length > 0): #starts at the back of the strand | ||
| rev_complement = rev_complement + complement[strand_length-1] | ||
| strand_length -= 1 #moves backwards | ||
| return (rev_complement) | ||
| # TODO: implement this | ||
| pass | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These are simple placeholders, so no need to leave them in your final code |
||
|
|
||
|
|
||
| def rest_of_ORF(dna): | ||
| def rest_of_ORF(dna): # need to get this checked | ||
| """ Takes a DNA sequence that is assumed to begin with a start | ||
| codon and returns the sequence up to but not including the | ||
| first in frame stop codon. If there is no in frame stop codon, | ||
|
|
@@ -61,8 +87,25 @@ def rest_of_ORF(dna): | |
| 'ATG' | ||
| >>> rest_of_ORF("ATGAGATAGG") | ||
| 'ATGAGA' | ||
|
|
||
| >>> rest_of_ORF("TAG") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 to custom unit testing! |
||
| '' | ||
|
|
||
| """ | ||
| #stop codons are TAA, TAG, TGA | ||
| # TODO: implement this | ||
| #stop_codons = ['TAA','TAG','TGA'] | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove comments please |
||
| stop_codons = ['TAA','TAG','TGA'] | ||
|
|
||
| for i in range (0, len(dna)-1,3): # move though the strand every three bacepairs | ||
|
|
||
| if ( len(dna) - i >=3): # only check for stop if there are three or more bace pairs left | ||
| if dna[i:i+3] in stop_codons: #stop if you find a stop codon | ||
|
|
||
| return dna[0:i] #return dna before the codon | ||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like your inline comments. But you don't need to write them for every line if you feel like you are spending too much time explaining trivial things into words. |
||
|
|
||
| return dna | ||
| pass | ||
|
|
||
|
|
||
|
|
@@ -78,10 +121,27 @@ def find_all_ORFs_oneframe(dna): | |
| returns: a list of non-nested ORFs | ||
| >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC") | ||
| ['ATGCATGAATGTAGA', 'ATGTGCCC'] | ||
| >>> find_all_ORFs_oneframe("ATGATGCATGAATGTAGATAGATGTGCCC") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 another good unit test |
||
| ['ATGATGCATGAATGTAGA', 'ATGTGCCC'] | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
| all_pairs = len(dna) #records the length of the total DNA strand | ||
| total_checked = 0 #the number of strands checked. | ||
| orfs = [] #list of found orfs | ||
|
|
||
| i =0 #The number of pairs that have been checked | ||
|
|
||
|
|
||
| while i <= all_pairs: #while loop goes through the length of the strand | ||
| stran = "" | ||
| if dna[i:i+3] == "ATG": #if a start codon is found | ||
| stran = "ATG" # records the start codon | ||
| #print (dna[i:]) | ||
| i = i + 3 #skipps the start codon | ||
| stran = stran + rest_of_ORF(dna[i:]) #add the rest of the orf to the start codon | ||
| orfs.append(stran) #saves the found orf | ||
| i = i + 3 #skips stop codon | ||
| i = i + 3 #moves to next framm | ||
| return orfs #resturns found orfs | ||
|
|
||
| def find_all_ORFs(dna): | ||
| """ Finds all non-nested open reading frames in the given DNA sequence in | ||
|
|
@@ -95,7 +155,14 @@ def find_all_ORFs(dna): | |
|
|
||
| >>> find_all_ORFs("ATGCATGAATGTAG") | ||
| ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG'] | ||
|
|
||
| """ | ||
| orf = [] | ||
| stran1 = find_all_ORFs_oneframe(dna) #finds orfs in frame 1 | ||
| orf = (stran1) | ||
| orf = orf + (find_all_ORFs_oneframe(dna[1:])) #finds orfs in frame 2 | ||
| orf = orf +(find_all_ORFs_oneframe(dna[2:])) #finds orfs in frame 3 | ||
| return orf #returns found codon | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
|
|
@@ -109,6 +176,11 @@ def find_all_ORFs_both_strands(dna): | |
| >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA") | ||
| ['ATGCGAATG', 'ATGCTACATTCGCAT'] | ||
| """ | ||
|
|
||
| orf = [] | ||
| orf = orf + find_all_ORFs(dna) #finds orfs all refrence frames | ||
| orf = orf + find_all_ORFs(get_reverse_complement(dna)) #finds orf on the other side of the stran | ||
| return orf | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
|
|
@@ -119,6 +191,7 @@ def longest_ORF(dna): | |
| >>> longest_ORF("ATGCGAATGTAGCATCAAA") | ||
| 'ATGCTACATTCGCAT' | ||
| """ | ||
| return max(find_all_ORFs_both_strands(dna), key=len); #returns longest orf | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nice concise & efficient code |
||
| # TODO: implement this | ||
| pass | ||
|
|
||
|
|
@@ -129,7 +202,19 @@ def longest_ORF_noncoding(dna, num_trials): | |
|
|
||
| dna: a DNA sequence | ||
| num_trials: the number of random shuffles | ||
| returns: the maximum length longest ORF """ | ||
|
|
||
| """ | ||
| max_orf = "" | ||
| temp = "" | ||
| for i in range(0, num_trials): | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. one stylistic convention. |
||
| random_dna = shuffle_string(dna) #suffles DNA | ||
| temp = longest_ORF(random_dna) # finds the longest orf in the shuffled dna | ||
| if len(temp) > len(max_orf): #if the current orf is longer than the previose longest | ||
| max_orf = str(temp) #make the current orf the longest | ||
|
|
||
|
|
||
|
|
||
| return len(max_orf) #return the longest orf of all the orfs | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
|
|
@@ -147,7 +232,29 @@ def coding_strand_to_AA(dna): | |
| 'MR' | ||
| >>> coding_strand_to_AA("ATGCCCGCTTT") | ||
| 'MPA' | ||
|
|
||
|
|
||
|
|
||
|
|
||
| """ | ||
| # >>> coding_strand_to_AA("ATGCGAATGTAGCATCAAA") | ||
| # 'MPA' | ||
|
|
||
| ans = '' | ||
| # for codon in codons: | ||
| # for c in codon: | ||
| # for i in range(0,len(dna),3): | ||
| # | ||
| # if dna[i:i+3] == c: | ||
| # ans = ans + aa[codons.index(codon)] | ||
| # | ||
| # | ||
| # return ans | ||
| for i in range(0,len(dna),3): #amino acids are three base pairs long | ||
| if len(dna[i:i+3]) < 3: #read as long as there are atleast 3 bace pairs left | ||
| continue | ||
| ans = ans + aa_table[dna[i:i+3]] #reads three bace pairs and matches it to an amino acid | ||
| return #return amino acid | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
|
|
@@ -158,9 +265,30 @@ def gene_finder(dna): | |
| dna: a DNA sequence | ||
| returns: a list of all amino acid sequences coded by the sequence dna. | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| found_dna = [] | ||
| long_acids = [] | ||
| threshold = longest_ORF_noncoding(dna, 1500) # longest random orf after 1500 samples | ||
| # print(threshold) | ||
|
|
||
| #print(dna) | ||
| #print(longest_ORF_noncoding(dna,1000)) the average length is 332 110 amino acids | ||
| found_dna = find_all_ORFs_both_strands(dna) #finds all of the orfs in the dna | ||
| # print(found_dna) | ||
| found_acids = [] | ||
| for c in found_dna: | ||
| found_acids.append(coding_strand_to_AA(c)) #translates bacepairs into amino acids | ||
| for c in found_acids: | ||
| if len(c) >= threshold/3: #threshold was measured in bace pairs there are 3 base pairs per amino acid | ||
| long_acids.append(c) #list of amino acid orfs that are longer than the threshold | ||
| print("threshold: " + str(threshold)) | ||
| print ("Found Acids: ") | ||
|
|
||
| return long_acids | ||
|
|
||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| import doctest | ||
| doctest.testmod() | ||
| # print(len(dna)) | ||
|
|
||
| gene_finder(dna) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
While the logic here is similar for both
importandload_seq, the former is a module import statement while the latter is a function call. It is conventional to keepimportstatements separate from the rest of the code. In this case,load_seqshould be in__main__