-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_examples.py
More file actions
166 lines (139 loc) · 7.23 KB
/
create_examples.py
File metadata and controls
166 lines (139 loc) · 7.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
Generate example cancer mutation FASTA files.

Run this script to create sample data for testing the DNA sonification platform.
"""
from pathlib import Path
def _fasta_record(header: str, *sequence_lines: str) -> str:
    """Build one FASTA record: a ``>header`` line followed by the sequence lines.

    The pieces are joined with newlines; no trailing newline is appended.
    """
    return "\n".join((f">{header}", *sequence_lines))


def create_example_fasta_files() -> None:
    """Write the example wildtype/mutant FASTA pairs and a README.

    Creates an ``examples`` directory relative to the current working
    directory (if it does not already exist), writes ten FASTA files
    (wildtype + mutant for TP53, KRAS, BRAF, EGFR and PIK3CA) and a
    ``README.md`` describing them, and prints one line per file created.
    Existing files with the same names are overwritten.

    NOTE(review): the KRAS mutant now carries the GGT->GAT change at codon 12
    that the README describes. The substitutions embedded in the other four
    pairs do not obviously match the codon changes listed in the README;
    confirm them against reference sequences before relying on them.
    """
    examples_dir = Path("examples")
    examples_dir.mkdir(exist_ok=True)

    # TP53 R273H mutation (hotspot in DNA-binding domain)
    tp53_wt = _fasta_record(
        "TP53_wildtype DNA-binding domain",
        "ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCA",
        "GACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATG",
        "GATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCA",
        "GATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCT",
        "ACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCT",
    )
    tp53_mut = _fasta_record(
        "TP53_R273H_mutant DNA-binding domain with R273H mutation",
        "ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCA",
        "GACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATG",
        "GATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCA",
        "GATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCGCGTGGCCCCTGCACCAGCAGCTCCT",
        "ACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCT",
    )

    # KRAS G12D mutation (codon 12 mutation)
    kras_wt = _fasta_record(
        "KRAS_wildtype codon 12 region",
        "ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGGCGTAGGCAAGAGTGCCTTGACG",
        "ATACAGCTAATTCAGAATCATTTTGTGGACGAATATGATCCAACAATAGAGGATTCCTAC",
        "AGGAAGCAAGTAGTAATTGATGGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGT",
        "CAAGAGGAGTACAGTGCAATGAGGGACCAGTACATGAGGACTGGGGAGGGCTTTCTTTGT",
        "GTATTTGCCATAAATAATACTAAATCATTTGAAGATATTCACCATTATAGAGAACAAATT",
        "AAAAGAGTTAAGGACTCTGAAGATGTACCTATGGTCCTAGTAGGAAATAAATGTGATTTG",
    )
    # Codon 12 (bases 34-36 of the first line) is GGT in the wildtype; the
    # mutant carries GAT there (Gly->Asp). Previously this record was a
    # byte-for-byte copy of the wildtype, so the pair showed no mutation.
    kras_mut = _fasta_record(
        "KRAS_G12D_mutant codon 12 G12D activating mutation",
        "ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGATGGCGTAGGCAAGAGTGCCTTGACG",
        "ATACAGCTAATTCAGAATCATTTTGTGGACGAATATGATCCAACAATAGAGGATTCCTAC",
        "AGGAAGCAAGTAGTAATTGATGGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGT",
        "CAAGAGGAGTACAGTGCAATGAGGGACCAGTACATGAGGACTGGGGAGGGCTTTCTTTGT",
        "GTATTTGCCATAAATAATACTAAATCATTTGAAGATATTCACCATTATAGAGAACAAATT",
        "AAAAGAGTTAAGGACTCTGAAGATGTACCTATGGTCCTAGTAGGAAATAAATGTGATTTG",
    )

    # BRAF V600E mutation (common in melanoma)
    braf_wt = _fasta_record(
        "BRAF_wildtype V600 region kinase domain",
        "ATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTGAAATCTCGATGGA",
        "GTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATGGTAAGAATTGAGG",
        "CTATTTTTCCACTGATTAAATTTTTGGCCCTGAGATGCTGCTGAGTTACTAGAAAGTCAT",
        "TTATGAGGACTTGGATATTTTATTTGGTGACTGATATTAGCATTCAGAATGAGTTGCAAC",
        "TAGAAATTATAAGAGAATTCAGATACATTTCTTGATTGAAGACTTAAAATTCTTTCTAAC",
        "TCTTCAGGCATGAAGGTACTCAGCAATGGTTACACAGACCTAGCAGATATTGTTGAAGAC",
    )
    braf_mut = _fasta_record(
        "BRAF_V600E_mutant V600E activating mutation",
        "ATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTGAAATCTCGATGGA",
        "GTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATGGTAAGAATTGAGG",
        "CTATTTTTCCACTGATTAAATTTTTGGCCCTGAGATGCTGCTGAGTTACTAGAAAGTCAT",
        "TTATGAGGACTTGGATATTTTATTTGGTGACTGATATTAGCATTCAGAATGAGTTGCAAC",
        "TAGAAATTATAAGAGAATTCAGATACATTTCTTGATTGAAGACTTAAAATTCTTTCTAAC",
        "TCTTCAGGCATGAAGGTACTCAGCAATGGTTACACAGACCTAGAAGATATTGTTGAAGAC",
    )

    # EGFR L858R mutation (non-small cell lung cancer)
    egfr_wt = _fasta_record(
        "EGFR_wildtype L858 region tyrosine kinase domain",
        "ATCAAGAATCAGGAGAAAGCTGCCAGCGCCTTCTCAAGCCTCTATGCCTTGGCCAACAGG",
        "CCCCAGATGCTCAATTCCATCATGGACCCGCCATACTCGGAGGCCAAGGTGGGCTTTGGG",
        "GACATCCTGCAGGCCCGCTGCTCCCAGTGCCGGGATGCGGTCCCCCCCAGTGTCGTGAAG",
        "GCGGAGGATGTCCTGGAGAAGGAGATCCTGCCCCAGTGGGTCAGCGTGCTGCGGGCCATC",
        "TATGAGAACCGGAGTGACACGCTGGATGCTGAGGAGACCCTGCAGCGGCTGGGGCCCTTT",
    )
    egfr_mut = _fasta_record(
        "EGFR_L858R_mutant L858R activating mutation",
        "ATCAAGAATCAGGAGAAAGCTGCCAGCGCCTTCTCAAGCCTCTATGCCTTGGCCAACAGG",
        "CCCCAGATGCTCAATTCCATCATGGACCCGCCATACTCGGAGGCCAAGGTGGGCTTTGGG",
        "GACATCCTGCAGGCCCGCTGCTCCCAGTGCCGGGATGCGGTCCCCCCCAGTGTCGTGAAG",
        "GCGGAGGATGTCCTGGAGAAGGAGATCCTGCCCCAGTGGGTCAGCGTGCTGCGGGCCATC",
        "TATGAGAACCGGAGTGACACGCTGGATGCTGAGGAGACCAGACAGCGGCTGGGGCCCTTT",
    )

    # PIK3CA H1047R mutation (breast cancer)
    pik3ca_wt = _fasta_record(
        "PIK3CA_wildtype H1047 region helical domain",
        "ATGAAAGCAACTCAGCAATCAGCCCATACAGGTTATTGGCCATTTTTGGTCCAATGATGC",
        "TTGGCTCTGGAATGCCAGAACTACAATCTTTTGAAGGAAATGAATGATGCACATCATGGT",
        "GGCTGGACAACAAAAGGGCAATCAGTTACTTTTCTCCACAGATATCATTGCAAATAGTCA",
        "AGAGTATTGGCTTATGGATTTACAACATATTGATTGTGCATTGATGGCATGGTATATTCA",
        "TGATGCAGAGACGATTGATTTGTGGAAATTGGCTTGGGCACTTACATTGCTGAAGTGAAA",
    )
    pik3ca_mut = _fasta_record(
        "PIK3CA_H1047R_mutant H1047R gain-of-function mutation",
        "ATGAAAGCAACTCAGCAATCAGCCCATACAGGTTATTGGCCATTTTTGGTCCAATGATGC",
        "TTGGCTCTGGAATGCCAGAACTACAATCTTTTGAAGGAAATGAATGATGCACATCATGGT",
        "GGCTGGACAACAAAAGGGCAATCAGTTACTTTTCTCCACAGATATCATTGCAAATAGTCA",
        "AGAGTATTGGCTTATGGATTTACAACATATTGATTGTGCATTGATGGCATGGTATATTCA",
        "TGATGCAGAGACGATTGATTTGTGGAAATTGGCTTGGGCGCTTACATTGCTGAAGTGAAA",
    )

    # Insertion order determines the order files are written and reported.
    files = {
        "TP53_wildtype.fasta": tp53_wt,
        "TP53_R273H_mutant.fasta": tp53_mut,
        "KRAS_wildtype.fasta": kras_wt,
        "KRAS_G12D_mutant.fasta": kras_mut,
        "BRAF_wildtype.fasta": braf_wt,
        "BRAF_V600E_mutant.fasta": braf_mut,
        "EGFR_wildtype.fasta": egfr_wt,
        "EGFR_L858R_mutant.fasta": egfr_mut,
        "PIK3CA_wildtype.fasta": pik3ca_wt,
        "PIK3CA_H1047R_mutant.fasta": pik3ca_mut,
    }

    for filename, content in files.items():
        filepath = examples_dir / filename
        # Explicit encoding, matching the README write below, so the output
        # does not depend on the platform's default encoding.
        filepath.write_text(content, encoding="utf-8")
        print(f"Created: {filepath}")

    # README text, one entry per line; written with a trailing newline.
    readme_lines = (
        "# Cancer Mutation Example FASTA Files",
        "This directory contains example DNA sequences for common cancer-related mutations:",
        "## TP53 (Tumor Suppressor)",
        "- **TP53_wildtype.fasta**: Normal DNA-binding domain",
        "- **TP53_R273H_mutant.fasta**: R273H hotspot mutation (CGC→CAC, Arg→His)",
        "- Common in colorectal, lung, and breast cancers",
        "- Loss of tumor suppressor function",
        "## KRAS (Oncogene)",
        "- **KRAS_wildtype.fasta**: Normal codon 12 region",
        "- **KRAS_G12D_mutant.fasta**: G12D activating mutation (GGT→GAT, Gly→Asp)",
        "- Common in pancreatic, colorectal, and lung cancers",
        "- Constitutive activation of signaling",
        "## BRAF (Oncogene)",
        "- **BRAF_wildtype.fasta**: Normal V600 region",
        "- **BRAF_V600E_mutant.fasta**: V600E activating mutation (GTG→GAG, Val→Glu)",
        "- Common in melanoma and thyroid cancer",
        "- Constitutive kinase activation",
        "## EGFR (Receptor Tyrosine Kinase)",
        "- **EGFR_wildtype.fasta**: Normal L858 region",
        "- **EGFR_L858R_mutant.fasta**: L858R activating mutation (CTG→CGG, Leu→Arg)",
        "- Common in non-small cell lung cancer",
        "- Increased kinase activity",
        "## PIK3CA (Oncogene)",
        "- **PIK3CA_wildtype.fasta**: Normal H1047 region",
        "- **PIK3CA_H1047R_mutant.fasta**: H1047R gain-of-function mutation (CAT→CGT, His→Arg)",
        "- Common in breast, colorectal, and endometrial cancers",
        "- Enhanced PI3K pathway activation",
        "## Usage",
        "Upload pairs of wildtype and mutant files to the DNA Sonification Platform to:",
        "1. Visualize sequence alignments",
        "2. Hear mutations as distinct sound patterns",
        "3. Analyze functional impact with AI",
    )
    readme = "\n".join(readme_lines) + "\n"

    readme_path = examples_dir / "README.md"
    # The README contains non-ASCII arrows, so the encoding must be explicit.
    readme_path.write_text(readme, encoding="utf-8")

    print(f"\nCreated: {readme_path}")
    print(f"\n✓ Created {len(files)} FASTA files in {examples_dir}/")
    print("These files are ready to use with the DNA Sonification Platform!")
# Script entry point: generate the example files only when this module is run
# directly, not when it is imported.
if __name__ == "__main__":
    create_example_fasta_files()