forked from IanOlin/GeneFinder
-
Notifications
You must be signed in to change notification settings - Fork 0
Revisiting Genefinder #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
NathanYee
wants to merge
5
commits into
release
Choose a base branch
from
master
base: release
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
782f7e6
4% speed increase through use of hashtable
a076bf6
Used a dictionary in get_reverse_complement, another 4% speed increase
baeaaf6
added a list comprehension to reverse complement, no speed increase b…
20f30ff
added more hashtables and list comprehensions
97be046
cleaned up reverse_complement
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Binary file not shown.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ def shuffle_string(s): | |
|
|
||
| # YOU WILL START YOUR IMPLEMENTATION FROM HERE DOWN ### | ||
|
|
||
| complement_dict = {'A':'T', 'T':'A', 'C':'G', 'G':'C'} | ||
|
|
||
| def get_complement(nucleotide): | ||
| """ Returns the complementary nucleotide | ||
|
|
@@ -32,19 +33,7 @@ def get_complement(nucleotide): | |
| >>> get_complement('G') | ||
| 'C' | ||
| """ | ||
|
|
||
| #series of if elif statements that return complementary nucleotides | ||
| if nucleotide == 'A': | ||
| return 'T' | ||
| elif nucleotide == 'T': | ||
| return 'A' | ||
| elif nucleotide == 'C': | ||
| return 'G' | ||
| elif nucleotide == 'G': | ||
| return 'C' | ||
| else: | ||
| print "Invalid Arguemnt not a nucleotide" | ||
| return "A" | ||
| return complement_dict[nucleotide] | ||
|
|
||
| def get_reverse_complement(dna): | ||
| """ Computes the reverse complementary sequence of DNA for the specfied DNA | ||
|
|
@@ -58,12 +47,7 @@ def get_reverse_complement(dna): | |
| 'TGAACGCGG' | ||
| """ | ||
|
|
||
| new_string = "" | ||
|
|
||
| for i in dna[::-1]: #iterate from the last index to the first index | ||
| new_string = new_string + get_complement(i) | ||
|
|
||
| return new_string | ||
| return ''.join(get_complement(n) for n in dna[::-1]) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Effective use of list comprehensions. |
||
|
|
||
| def rest_of_ORF(dna): | ||
| """ Takes a DNA sequence that is assumed to begin with a start | ||
|
|
@@ -90,7 +74,7 @@ def rest_of_ORF(dna): | |
| if dna[i:i+3] in ['TAA','TAG','TGA']: | ||
| break | ||
| #add dna triple to new_string | ||
| new_string= new_string + dna[i:i+3] | ||
| new_string = new_string + dna[i:i+3] | ||
|
|
||
| #increase counter by 3 because we added triple | ||
| i = i + 3 | ||
|
|
@@ -138,7 +122,6 @@ def find_all_ORFs(dna): | |
| >>> find_all_ORFs("ATGCATGAATGTAG") | ||
| ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG'] | ||
| """ | ||
|
|
||
| dna_list = [] | ||
| for i in range(3): #go through each possible frames | ||
| dna_list = dna_list + find_all_ORFs_oneframe(dna[i:]) | ||
|
|
@@ -159,9 +142,8 @@ def find_all_ORFs_both_strands(dna): | |
| dna_list = [] #add orf's to list then return | ||
|
|
||
|
|
||
| for list in string_list: | ||
| temp_list = find_all_ORFs(list) | ||
| for item in temp_list: | ||
| for temp in string_list: | ||
| for item in find_all_ORFs(temp): | ||
| dna_list.append(item) | ||
|
|
||
| return dna_list | ||
|
|
@@ -187,14 +169,9 @@ def longest_ORF_noncoding(dna, num_trials): | |
| dna: a DNA sequence | ||
| num_trials: the number of random shuffles | ||
| returns: the maximum length longest ORF """ | ||
| maximum_length = 0 #maximum length will store the strand with the maximum length | ||
| string = "" #this is the string that we will return | ||
| for i in range(num_trials): #run program num_trials times | ||
| shuffled = longest_ORF(shuffle_string(dna)) #get the longest string frum shuffled input | ||
| if len(shuffled) > maximum_length: #compare length of string to previous maximum | ||
| maximum_length = len(shuffled) #assign new maximum if true | ||
| string = shuffled | ||
| return string | ||
|
|
||
| return max(len(longest_ORF(shuffle_string(dna))) for test in xrange(num_trials)) | ||
|
|
||
|
|
||
| def coding_strand_to_AA(dna): | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is miles better than the mess of ifs and elses you had before. |
||
| """ Computes the Protein encoded by a sequence of DNA. This function | ||
|
|
@@ -210,63 +187,9 @@ def coding_strand_to_AA(dna): | |
| >>> coding_strand_to_AA("ATGCCCGCTTT") | ||
| 'MPA' | ||
| """ | ||
| i = 0 | ||
| protein = "" #protein stores each amino value for triples | ||
| amino = "" #a value which we append to protein, will change depending on triple catigorization | ||
| for i in range(0,len(dna),3): #we step 3 because we want triples | ||
| triple = dna[i:i+3] #get the next three codons from dna | ||
|
|
||
| if len (triple) == 3: #make sure triple has length 3, appending occures at the end of this statement | ||
| #lots of if elif statements to categorize dna to amino acid | ||
| #each will assign a new value to amino | ||
| #yes I do know about dictionaries | ||
| if triple in ["TTT","TTC"]: | ||
| amino = "F" | ||
| elif triple in ['TTA','TTG','CTT','CTC','CTA','CTG']: | ||
| amino = "L" | ||
| elif triple in ['ATT','ATC','ATA']: | ||
| amino = "I" | ||
| elif triple in ["ATG"]: | ||
| amino = "M" | ||
| elif triple in ['GTT','GTC','GTA','GTG']: | ||
| amino = "V" | ||
| elif triple in ['TCT','TCC','TCA','TCG']: | ||
| amino = "S" | ||
| elif triple in ['CCT','CCC','CCA','CCG']: | ||
| amino = "P" | ||
| elif triple in ['ACT','ACC','ACA','ACG']: | ||
| amino = "T" | ||
| elif triple in ['GCT','GCC','GCA','GCG']: | ||
| amino = "A" | ||
| elif triple in ['TAT','TAC']: | ||
| amino = "Y" | ||
| elif triple in ['CAT','CAC']: | ||
| amino = "H" | ||
| elif triple in ['CAA','CAG']: | ||
| amino = "Q" | ||
| elif triple in ['AAT','AAC']: | ||
| amino = "N" | ||
| elif triple in ['AAA','AAG']: | ||
| amino = "K" | ||
| elif triple in ['GAT','GAC']: | ||
| amino = "D" | ||
| elif triple in ['GAA','GAG']: | ||
| amino = "E" | ||
| elif triple in ['TGT','TGC']: | ||
| amino = "C" | ||
| elif triple in ['TGG']: | ||
| amino = "W" | ||
| elif triple in ['CGT','CGC','CGA','CGG']: | ||
| amino = "R" | ||
| elif triple in ['AGT','AGC']: | ||
| amino = "S" | ||
| elif triple in ['AGT','AGC']: | ||
| amino = "R" | ||
| elif triple in ['GGT','GGC','GGA','GGG']: | ||
| amino = "G" | ||
| # end categorization | ||
| protein = protein + amino #append amino value to protein | ||
| return protein #return string | ||
| protein = "" | ||
| protein.join(aa_table[dna[i:i+3]] for i in xrange(0, len(dna)-2, 3)) | ||
| return protein | ||
|
|
||
| def gene_finder(dna): | ||
| """ Returns the amino acid sequences that are likely coded by the specified dna | ||
|
|
@@ -278,7 +201,7 @@ def gene_finder(dna): | |
| threshold = longest_ORF_noncoding(dna,1500) | ||
| all_ORFs = find_all_ORFs_both_strands(dna) | ||
| for item in all_ORFs: | ||
| if len(item) > len(threshold): | ||
| if len(item) > threshold: | ||
| AA_list.append(coding_strand_to_AA(item)) | ||
| return AA_list | ||
|
|
||
|
|
@@ -287,7 +210,7 @@ def gene_finder(dna): | |
| contigs = load_contigs() | ||
| name,dna = contigs[5] | ||
|
|
||
| print gene_finder(dna) | ||
| gene_finder(dna) | ||
| print("---%s seconds ---" % (time.time()-start_time)) | ||
|
|
||
| if __name__ == "__main__": | ||
|
|
||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Overall: good use of dictionaries and list comprehensions, and good general cleanup.