Revisiting Genefinder #1

phuston · 2016-05-05T16:44:04Z

Overall: good use of dictionaries and list comprehensions, and nice general cleanup.

phuston · 2016-05-05T16:42:01Z

Effective use of list comprehensions.

phuston · 2016-05-05T16:43:10Z

This is miles better than the mess of ifs and elses you had before.

-Original file line number
+Diff line change
@@ Expand Up / @@ -19,6 +19,7 @@ def shuffle_string(s): @@
     # YOU WILL START YOUR IMPLEMENTATION FROM HERE DOWN ###
+    complement_dict = {'A':'T', 'T':'A', 'C':'G', 'G':'C'}
     def get_complement(nucleotide):
         """ Returns the complementary nucleotide
@@ Expand All / @@ -32,19 +33,7 @@ def get_complement(nucleotide): @@
         >>> get_complement('G')
         'C'
         """
-        #series of if elif statements that return complementary nucleotides
-        if nucleotide == 'A':
-            return 'T'
-        elif nucleotide == 'T':
-            return 'A'
-        elif nucleotide == 'C':
-            return 'G'
-        elif nucleotide == 'G':
-            return 'C'
-        else:
-            print "Invalid Arguemnt not a nucleotide"
-            return "A"
+        return complement_dict[nucleotide]
     def get_reverse_complement(dna):
         """ Computes the reverse complementary sequence of DNA for the specfied DNA
@@ Expand All / @@ -58,12 +47,7 @@ def get_reverse_complement(dna): @@
         'TGAACGCGG'
         """
-        new_string = ""
-        for i in dna[::-1]: #iterate from the last index to the first index
-            new_string = new_string + get_complement(i)
-        return new_string
+        return ''.join(get_complement(n) for n in dna[::-1])
     def rest_of_ORF(dna):
         """ Takes a DNA sequence that is assumed to begin with a start
@@ Expand All / @@ -90,7 +74,7 @@ def rest_of_ORF(dna): @@
             if dna[i:i+3] in ['TAA','TAG','TGA']:
                 break
             #add dna triple to new_string
-            new_string= new_string + dna[i:i+3]
+            new_string = new_string + dna[i:i+3]
             #increase counter by 3 because we added triple
             i = i + 3
@@ Expand Down Expand Up / @@ -138,7 +122,6 @@ def find_all_ORFs(dna): @@
         >>> find_all_ORFs("ATGCATGAATGTAG")
         ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
         """
         dna_list = []
         for i in range(3): #go through each possible frames
             dna_list = dna_list + find_all_ORFs_oneframe(dna[i:])
@@ Expand All / @@ -159,9 +142,8 @@ def find_all_ORFs_both_strands(dna): @@
         dna_list = [] #add orf's to list then return
-        for list in string_list:
-            temp_list = find_all_ORFs(list)
-            for item in temp_list:
+        for temp in string_list:
+            for item in find_all_ORFs(temp):
                 dna_list.append(item)
         return dna_list
@@ Expand All / @@ -187,14 +169,9 @@ def longest_ORF_noncoding(dna, num_trials): @@
             dna: a DNA sequence
             num_trials: the number of random shuffles
             returns: the maximum length longest ORF """
-        maximum_length = 0 #maximum length will store the strand with the maximum length
-        string = "" #this is the string that we will return
-        for i in range(num_trials): #run program num_trials times
-            shuffled = longest_ORF(shuffle_string(dna)) #get the longest string frum shuffled input
-            if len(shuffled) > maximum_length: #compare length of string to previous maximum
-                maximum_length = len(shuffled) #assign new maximum if true
-                string = shuffled
-        return string
+        return max(len(longest_ORF(shuffle_string(dna))) for test in xrange(num_trials))
     def coding_strand_to_AA(dna):
         """ Computes the Protein encoded by a sequence of DNA.  This function
@@ Expand All / @@ -210,63 +187,9 @@ def coding_strand_to_AA(dna): @@
             >>> coding_strand_to_AA("ATGCCCGCTTT")
             'MPA'
         """
-        i = 0
-        protein = "" #protein stores each amino value for triples
-        amino = "" #a value which we append to protein, will change depending on triple catigorization
-        for i in range(0,len(dna),3): #we step 3 because we want triples
-            triple = dna[i:i+3] #get the next three codons from dna
-            if len (triple) == 3: #make sure triple has length 3, appending occures at the end of this statement
-                #lots of if elif statements to categorize dna to amino acid
-                #each will assign a new value to amino
-                #yes I do know about dictionaries
-                if triple in ["TTT","TTC"]:
-                    amino = "F"
-                elif triple in ['TTA','TTG','CTT','CTC','CTA','CTG']:
-                    amino = "L"
-                elif triple in ['ATT','ATC','ATA']:
-                    amino = "I"
-                elif triple in ["ATG"]:
-                    amino = "M"
-                elif triple in ['GTT','GTC','GTA','GTG']:
-                    amino = "V"
-                elif triple in ['TCT','TCC','TCA','TCG']:
-                    amino = "S"
-                elif triple in ['CCT','CCC','CCA','CCG']:
-                    amino = "P"
-                elif triple in ['ACT','ACC','ACA','ACG']:
-                    amino = "T"
-                elif triple in ['GCT','GCC','GCA','GCG']:
-                    amino = "A"
-                elif triple in ['TAT','TAC']:
-                    amino = "Y"
-                elif triple in ['CAT','CAC']:
-                    amino = "H"
-                elif triple in ['CAA','CAG']:
-                    amino = "Q"
-                elif triple in ['AAT','AAC']:
-                    amino = "N"
-                elif triple in ['AAA','AAG']:
-                    amino = "K"
-                elif triple in ['GAT','GAC']:
-                    amino = "D"
-                elif triple in ['GAA','GAG']:
-                    amino = "E"
-                elif triple in ['TGT','TGC']:
-                    amino = "C"
-                elif triple in ['TGG']:
-                    amino = "W"
-                elif triple in ['CGT','CGC','CGA','CGG']:
-                    amino = "R"
-                elif triple in ['AGT','AGC']:
-                    amino = "S"
-                elif triple in ['AGT','AGC']:
-                    amino = "R"
-                elif triple in ['GGT','GGC','GGA','GGG']:
-                    amino = "G"
-                # end categorization
-                protein = protein + amino #append amino value to protein
-        return protein #return string
+        protein = ""
+        protein.join(aa_table[dna[i:i+3]] for i in xrange(0, len(dna)-2, 3))
+        return protein
     def gene_finder(dna):
         """ Returns the amino acid sequences that are likely coded by the specified dna
@@ Expand All / @@ -278,7 +201,7 @@ def gene_finder(dna): @@
         threshold = longest_ORF_noncoding(dna,1500)
         all_ORFs = find_all_ORFs_both_strands(dna)
         for item in all_ORFs:
-            if len(item) > len(threshold):
+            if len(item) > threshold:
                 AA_list.append(coding_strand_to_AA(item))
         return AA_list
@@ Expand All / @@ -287,7 +210,7 @@ def gene_finder(dna): @@
     contigs = load_contigs()
     name,dna = contigs[5]
-    print gene_finder(dna)
+    gene_finder(dna)
     print("---%s seconds ---" % (time.time()-start_time))
     if __name__ == "__main__":
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Revisiting Genefinder #1

Uh oh!

Diff view

Diff view

There are no files selected for viewing

phuston May 5, 2016

Uh oh!

phuston May 5, 2016

Uh oh!

phuston May 5, 2016

Uh oh!

Revisiting Genefinder #1

Are you sure you want to change the base?

Uh oh!

Revisiting Genefinder #1

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

phuston May 5, 2016

Choose a reason for hiding this comment

Uh oh!

phuston May 5, 2016

Choose a reason for hiding this comment

Uh oh!

phuston May 5, 2016

Choose a reason for hiding this comment

Uh oh!