-
Notifications
You must be signed in to change notification settings - Fork 18
Aditya Kaushika Mini Project One #10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,37 +2,48 @@ | |
| """ | ||
| YOUR HEADER COMMENT HERE | ||
|
|
||
| @author: YOUR NAME HERE | ||
| @author: Aditya Kaushika | ||
|
|
||
| """ | ||
|
|
||
| import random | ||
| from amino_acids import aa, codons, aa_table # you may find these useful | ||
| from load import load_seq | ||
|
|
||
def count_v1(dna, base):
    """ Counts how many elements of dna are equal to base

        dna: a DNA sequence (a string, or any iterable of nucleotides)
        base: the nucleotide to count
        returns: the number of elements of dna that compare equal to base
    >>> count_v1("ATGCA", "A")
    2
    >>> count_v1("", "A")
    0
    """
    # Iterate the sequence directly; copying it into a list first adds
    # nothing, and a generator keeps the count in a single pass.
    return sum(1 for nucleotide in dna if nucleotide == base)
|
|
||
|
|
||
def shuffle_string(s):
    """Shuffles the characters in the input string
        NOTE: this is a helper function, you do not
        have to modify this in any way """
    # random.sample draws len(s) characters without replacement, which is
    # a random permutation of the characters of s.
    return ''.join(random.sample(s, len(s)))
|
|
||
| # YOU WILL START YOUR IMPLEMENTATION FROM HERE DOWN ### | ||
|
|
||
|
|
||
def get_complement(nucleotide):
    """ Returns the complementary nucleotide

        nucleotide: a nucleotide (A, C, G, or T) represented as a string
        returns: the complementary nucleotide
        raises: ValueError if nucleotide is not one of A, C, G, or T
    >>> get_complement('A')
    'T'
    >>> get_complement('C')
    'G'
    >>> get_complement('T')
    'A'
    >>> get_complement('G')
    'C'
    """
    complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    # Reject anything outside A/C/G/T explicitly. Falling through to a
    # default answer would silently report 'G' for inputs such as 'S'.
    if nucleotide not in complements:
        raise ValueError("invalid nucleotide: %r" % (nucleotide,))
    return complements[nucleotide]
|
|
||
| def get_reverse_complement(dna): | ||
| """ Computes the reverse complementary sequence of DNA for the specfied DNA | ||
|
|
@@ -45,25 +56,30 @@ def get_reverse_complement(dna): | |
| >>> get_reverse_complement("CCGCGTTCA") | ||
| 'TGAACGCGG' | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| Dna_reversed = dna[::-1] #Start counting backwards | ||
| d='' #Create new sting | ||
| for letter in Dna_reversed: #Look at each backwards letter | ||
| reverse = get_complement(letter) #Get the complement of that letter | ||
| d+= reverse #Add it to our new list | ||
| return d #Visualize our list | ||
|
|
||
|
|
||
def rest_of_ORF(dna):
    """ Takes a DNA sequence that is assumed to begin with a start
        codon and returns the sequence up to but not including the
        first in frame stop codon.  If there is no in frame stop codon,
        returns the whole string.

        dna: a DNA sequence
        returns: the open reading frame represented as a string
    >>> rest_of_ORF("ATGTGAA")
    'ATG'
    >>> rest_of_ORF("ATGAGATAGG")
    'ATGAGA'
    >>> rest_of_ORF("ATGAAACCC")
    'ATGAAACCC'
    """
    stop_codons = ("TAA", "TAG", "TGA")
    # Step through the sequence one codon (three letters) at a time so
    # only in-frame stop codons are considered.
    for start in range(0, len(dna), 3):
        if dna[start:start + 3] in stop_codons:
            return dna[:start]
    # No in-frame stop codon: the ORF runs to the end of the sequence.
    return dna
|
|
||
|
|
||
| def find_all_ORFs_oneframe(dna): | ||
|
|
@@ -79,8 +95,18 @@ def find_all_ORFs_oneframe(dna): | |
| >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC") | ||
| ['ATGCATGAATGTAGA', 'ATGTGCCC'] | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| a=[] #Create List | ||
| rol=0 #start our remaining letters at 0 | ||
| for i in range(0,len(dna),3): # Look at three letters at a time | ||
| codon= dna[i:i+3] #assign value to codon | ||
| if (codon =='ATG' and rol<=0): #Restrictions for our if statement | ||
| orf= rest_of_ORF(dna[i:]) #assign value to orf | ||
| a.append(orf) #add on to our list | ||
| rol=len(orf) #add letters to our remaining letters | ||
| if (rol > 0): #ask if there are any remaining letters | ||
| rol = rol - 3 #if there are, take three away | ||
| return a #Display results | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like how you are commenting every line to help the reader understand what is going on! It is generally good practice to write readable code. But sometimes, especially when the code is simple and trivial, we don't need to leave comments for "every line" if it's taking too much time. |
||
|
|
||
|
|
||
|
|
||
| def find_all_ORFs(dna): | ||
|
|
@@ -96,42 +122,68 @@ def find_all_ORFs(dna): | |
| >>> find_all_ORFs("ATGCATGAATGTAG") | ||
| ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG'] | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
| a=[] #New List | ||
| for i in range(3): #Look at all three frams, instead of one | ||
| a.extend(find_all_ORFs_oneframe(dna[i:])) #Add onto list a | ||
|
|
||
| # for i in range(0,len(dna),3): | ||
| # if (dna[i:i+3]=='ATG'): | ||
| # a.append(rest_of_ORF(dna[i:])) | ||
| # elif (dna[i+1:i+4]=='ATG'): | ||
| # a.append(rest_of_ORF(dna[i:])) | ||
| # elif (dna[i+2:i+5]=='ATG'): | ||
| # a.append(rest_of_ORF(dna[i:])) | ||
| return a #Display results | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. When submitting your final code, please remove commented-out (deprecated) lines. |
||
|
|
||
|
|
||
def find_all_ORFs_both_strands(dna):
    """ Finds all non-nested open reading frames in the given DNA sequence on both
        strands.

        dna: a DNA sequence
        returns: a list of non-nested ORFs
    >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
    ['ATGCGAATG', 'ATGCTACATTCGCAT']
    """
    # The opposite strand is read in the other direction, so walk the
    # sequence backwards and complement each nucleotide. join() builds the
    # string once instead of re-copying it on every "+=".
    reverse_complement = ''.join(get_complement(base) for base in reversed(dna))

    # ORFs of the given strand come first, followed by those of the
    # reverse-complement strand.
    orfs = list(find_all_ORFs(dna))
    orfs.extend(find_all_ORFs(reverse_complement))
    return orfs
|
|
||
def longest_ORF(dna):
    """ Finds the longest ORF on both strands of the specified DNA and returns it
        as a string

        dna: a DNA sequence
        returns: the longest ORF found on either strand (the first one
                 encountered if several share the maximum length), or the
                 empty string if the sequence contains no ORF at all
    >>> longest_ORF("ATGCGAATGTAGCATCAAA")
    'ATGCTACATTCGCAT'
    """
    orfs = find_all_ORFs_both_strands(dna)
    # Without this guard a sequence with no start codon would leave the
    # result undefined; report "no ORF" as an empty string instead.
    if not orfs:
        return ""
    # max() returns the first element with the greatest length.
    return max(orfs, key=len)
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just a note on naming variables. Naming it Here's an article that's worth skimming through. http://archive.oreilly.com/pub/post/the_worlds_two_worst_variable.html |
||
|
|
||
def longest_ORF_noncoding(dna, num_trials):
    """ Computes the maximum length of the longest ORF over num_trials shuffles
        of the specfied DNA sequence

        dna: a DNA sequence
        num_trials: the number of random shuffles
        returns: the maximum length longest ORF (0 if num_trials is 0)
    """
    max_length = 0
    for _ in range(num_trials):
        # Shuffle the whole sequence on every trial. Slicing it by the
        # trial index would shorten the input a little more each time.
        shuffled = shuffle_string(dna)
        max_length = max(max_length, len(longest_ORF(shuffled)))
    return max_length
|
|
||
|
|
||
| def coding_strand_to_AA(dna): | ||
|
|
@@ -142,25 +194,34 @@ def coding_strand_to_AA(dna): | |
| dna: a DNA sequence represented as a string | ||
| returns: a string containing the sequence of amino acids encoded by the | ||
| the input DNA fragment | ||
|
|
||
| >>> coding_strand_to_AA("ATGCGA") | ||
| 'MR' | ||
| >>> coding_strand_to_AA("ATGCCCGCTTT") | ||
| 'MPA' | ||
| """ | ||
| # TODO: implement this | ||
| pass | ||
|
|
||
| a="" #New List for results to be put into | ||
| for i in range(0,len(dna),3): #Tell the function how long to look for | ||
| codon = dna[i:i+3] #Tell the function where to look | ||
| amino_acid = aa_table[codon] #assign value to amino_acid | ||
| a = a + amino_acid #continue the string | ||
| return a #actually return the amino acid | ||
|
|
||
def gene_finder(dna):
    """ Returns the amino acid sequences that are likely coded by the specified dna

        dna: a DNA sequence
        returns: a list of all amino acid sequences coded by the sequence dna.
    """
    # ORFs no longer than what shows up in randomly shuffled copies of the
    # sequence are treated as noise; 1500 shuffles set that length cut-off.
    threshold = longest_ORF_noncoding(dna, 1500)

    # Translate every ORF (on either strand) that is longer than the
    # threshold. Loading a sequence file and printing the result is left
    # to the caller, so this function never calls itself.
    return [coding_strand_to_AA(orf)
            for orf in find_all_ORFs_both_strands(dna)
            if len(orf) > threshold]
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a recursive function, but we don't really need recursion here. |
||
|
|
||
|
|
||
if __name__ == "__main__":
    import doctest

    # Run every doctest in this module, then run the examples for
    # coding_strand_to_AA once more with verbose output.
    doctest.testmod()
    doctest.run_docstring_examples(
        coding_strand_to_AA, globals(), verbose=True)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For the next mini project, please write a short description of the code (a header comment really helps when your code base gets large).