forked from simonwhelan/prequal
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSequence.cpp
More file actions
243 lines (226 loc) · 7.47 KB
/
Sequence.cpp
File metadata and controls
243 lines (226 loc) · 7.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
/*
* Sequence.cpp
*
* Created on: Oct 13, 2017
* Author: simon
*/
#include "Sequence.h"
int CSequence::_maxLength = 0;
char CSequence::_filterOut = 'X';
using namespace::std;
//////////////// CSequence
CSequence::CSequence(std::string name, std::string seq) {
AddName(name);
AddSequence(seq);
InitialiseFlags();
}
void CSequence::AddName(std::string name) {
if(!_name.empty()) {
std::cout << "\nCSequence ERROR: Trying to added name to non-empty named sequence\n";
exit(-1);
}
_name = name;
}
void CSequence::AddSequence(std::string seq) {
if (!_seq.empty()) {
std::cout << "\nCSequence ERROR: Trying to added sequence to non-empty named sequence\n";
exit(-1);
}
_seq = seq;
if(_seq.size() > _maxLength) { _maxLength = _seq.size(); }
}
std::string CSequence::Seq(int pos, bool filter, bool showOutside) {
std::stringstream ss;
if(pos != -1) {
if(!showOutside && !Inside[pos]) { ss << "0"; }
else if(filter && Remove[pos]) { if(_filterOut != '\0') { ss <<_filterOut; } }
else { ss << _seq[pos]; }
} else {
for( int i = 0 ; i < _seq.size(); i++) {
if(!showOutside && !Inside[i]) { continue; }
if(filter && Remove[i]) { if(_filterOut != '\0') { ss <<_filterOut; } }
else { ss << _seq[i]; }
}
}
return ss.str();
}
std::string CSequence::RealSeq(int pos) {
if(pos != -1) {
return _seq.substr(pos,1);
}
return _seq;
}
bool CSequence::Filter(int pos) {
if(Remove[pos] || !Inside[pos]) { return true; }
return false;
}
string CSequence::RealDNA(int pos) { // Real codon at position pos
if(pos == -1) { return _dna_seq; }
if(pos > length()) { cout << "\nPosition must be within amino acid sequence for RealDNA output"; }
return _dna_seq.substr(pos*3,3);
}
string CSequence::DNA(int pos, bool filter, bool showOutside) { // Filtered codon at position pos
std::stringstream ss;
if(pos != -1) {
if(pos > length()) { cout << "\nPosition must be within amino acid sequence for RealDNA output"; }
if(!showOutside && !Inside[pos]) { ss << "0" << "0" << "0"; }
else if(filter && Remove[pos]) { if(_filterOut != '\0') { ss <<_filterOut << _filterOut << _filterOut; } }
else { ss << _dna_seq.substr(pos*3,3); }
} else {
for( int i = 0 ; i < length(); i++) {
if(!showOutside && !Inside[i]) { continue; }
if(filter && Remove[i]) { if(_filterOut != '\0') { ss <<_filterOut << _filterOut << _filterOut; } }
else { ss << _dna_seq.substr(i*3,3); }
}
}
return ss.str();
}
// Function that examines a sequence and looks for self repeats
// ---
// This might be better served by a suffix tree implementation, but I'm just doing the dumb thing for now
bool CSequence::CleanRepeat(int repeatLength) {
string seq = _seq;
string dna = _dna_seq;
bool foundRepeat = false;
int end;
for(int pos = 0; pos < (int) seq.size() - repeatLength; pos ++) {
string repeat = seq.substr(pos,repeatLength);
// Repeat can never contain X
if(find(repeat.begin(),repeat.end(),'X') != repeat.end()) { continue; }
size_t next_pos= pos + repeatLength;
while(next_pos != string::npos) {
next_pos = seq.find(repeat,next_pos);
if(next_pos == string::npos) { break; }
foundRepeat=true;
for(end = 0; end < (int) seq.size() - repeatLength - next_pos; end++) { // Extend the repeat while there is perfect match
if(seq[pos + end + repeatLength] != seq[next_pos + end + repeatLength]) { break; }
}
seq.erase(next_pos,repeatLength + end);
if(!dna.empty()) {
dna.erase(next_pos*3,(repeatLength + end)*3);
}
}
}
if(foundRepeat) { _seq = seq; _dna_seq = dna; }
return foundRepeat;
}
// Translation functions
bool CSequence::MakeTranslation(bool forceUniversal) {
// Check the sequence is in triplets and suitable for translation
if(length() % 3 != 0) { cout << "\nTrying to translate " << Name() << ", but not in a multiple of three\n\n"; exit(-1); }
if(forceUniversal) {
if(TryTranslation(0,true)) { return true; }
} else {
for(int i = 0; i < NumGenCode; i++) {
if(TryTranslation(i)) { return true; }
}
}
return false;
}
bool CSequence::TryTranslation(int genCode, bool force) {
if(_genCode != -1) { cout << "\nTrying translation when sequence is already translated\n"; exit(-1); }
string codon;
string aa_seq;
bool okay;
int cod_num;
for(int i = 0 ; i < length(); i+=3) {
codon = _seq.substr(i,3);
okay = true;
for(auto &s : codon) {
if(s == '?' || toupper(s )== 'X' || toupper(s) == 'N') { // Checks for ambiguity characters, but may miss some weirder ones
okay = false; aa_seq.push_back('X'); break;
} else if(IsGap(s)) { // If it's a gap then push out a gap
okay = false; aa_seq.push_back('-'); break;
}
} // Check whether codon can be translated
if(!okay) { continue; }
cod_num = GenCodes[genCode][GetCodon(codon)];
if(cod_num == -1) {
if(force) { aa_seq.push_back('X'); continue; }
else { // Last codon or force allowed to be a stop codon
if(i + 3 >= length()) {
_seq.erase(i,3);
warningStream << "\nWARNING: " << Name() << " has has stop codon at end of sequence removed";
break;
}
else { return false; }
}
}
aa_seq.push_back(AA_ABET[cod_num]);
}
_dna_seq = _seq;
_seq = aa_seq;
_genCode = genCode;
return true;
}
/////////////// Minor functions
// File reader
std::vector <CSequence> *FASTAReader(std::string SeqFile, bool forceUniversal) {
std::vector <CSequence> *RetSeq = new std::vector<CSequence>();
std::ifstream input(SeqFile.c_str(), std::ifstream::in);
if(!input.good()){
std::cerr << "Error opening '"<< SeqFile <<"'. Provide a valid file." << std::endl;
exit(-1);
}
std::string line, name, content;
while(getline( input, line ) ){
line = RemoveWhiteSpace(line);
if(line.empty()) { continue; } // Skip blank lines
if(line[0] == '>' ){ // Identifier marker
if( !line.empty() ){
if(!name.empty()) { // Push the sequence back
RetSeq->push_back(CSequence(RemoveWhiteSpace(name),RemoveWhiteSpace(content)));
name.clear(); content.clear();
}
name = line.substr(1);
}
content.clear();
} else if( !name.empty() ){
content += RemoveWhiteSpace(line);
}
}
// Add the final sequence
RetSeq->push_back(CSequence(RemoveWhiteSpace(name),RemoveWhiteSpace(content)));
// Check whether all DNA
bool allDNA = true;
for(auto & seq : *RetSeq) {
for(auto & s : seq.Seq()) {
if(IsGap(s)) { continue; }
if(!IsDNA(s)) { allDNA = false; break; }
}
if(!allDNA) { break; }
}
if(allDNA) {
cout << "\nFound only DNA sequences. Doing translations.";
for(auto & seq : *RetSeq) {
if(!seq.MakeTranslation(forceUniversal)) {
cout << "\nFound DNA sequences, but cannot find a successful translation... abandoning!";
cout << "\nSequence failed: " << seq.Name() << "\n\n"; exit(-1);
}
}
}
return RetSeq;
}
std::string RemoveWhiteSpace(std::string s) {
s.erase( std::remove_if( s.begin(), s.end(), ::isspace ), s.end() );
return s;
}
std::vector <std::string> Tokenise(std::string line) {
std::string buf;
std::stringstream in(line);
std::vector <std::string> Toks;
Toks.~vector();
while(in >> buf) { Toks.push_back(buf); }
return Toks;
}
std::vector <std::string> Tokenise(std::string line, std::string Delim) {
size_t i = 0, j,j1;
std::vector <std::string> Toks;
while(i != (int)line.size()) {
j = line.find(Delim,i+1);
if(j == std::string::npos) { j = j1 = (int)line.size(); } else { j1 = j+1; }
Toks.push_back(line.substr(i,j-i));
i = j1;
}
return Toks;
}