-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmascot.py
462 lines (379 loc) · 17.5 KB
/
mascot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
#!/bin/env python
#
# -*- coding: utf-8 -*-
#
# Copyright (C) University of Manchester 2011
# Julian Selley <[email protected]>
################################################################################
"""
Mascot module
=============
This module contains code related to interpreting Mascot information
and files. Mascot is a piece of software produced by U{Matrix
Science(R) <http://www.matrixscience.com/>}, which is used in
Proteomics to search Mass Spectrometry data for potential peptide
identifications.
Description
-----------
Mass spectrometers take a set of molecules that have been isolated by
mass and charge, and fragment them using a highly charged field. The ions
generated from the fragmentation of those molecules can be further
fragmented. If the ions that are subjected to the second level of
fragmentation originate from a peptide, they will further fragment
revealing mass / charge information related to the peptide sequence.
The spectra (the fragmentation pattern) from the second level of
fragmentation are sent to software to provide identifications on
peptide sequences, and ultimately on the proteins they belong to. In
Europe, the accepted software for doing this is Mascot, produced by a
company U{Matrix Science <http://www.matrixscience.com/>}. This
library provides an input into processing Mascot-related data.
This library provides the following functionality:
- reading the User and Group configuration files
- reading the search logs
Dependencies
------------
- In order to provide some cross-operating system functionality, this
library depends on C{os.path}, to identify the location of defaults
for the files being loaded.
- The library provides the ability to load in the logs and in order to
do this it depends on the C{csv} (Comma Separated Values) library.
- In order to manipulate the time stamps in the logs, this library
depends on the C{time} library.
- The library provides classes to load and interpret the User and Group
information. These files are XML, and the classes therefore depend on
C{xml.dom.minidom} to facilitate their loading.
- The library provides debugging information via the C{logging}
library.
"""
# Metadata
__version__ = '1.0.1'
__author__ = 'Julian Selley <[email protected]>'
__copyright__ = 'Copyright 2011 University of Manchester, Julian Selley <[email protected]>'
__license__ = 'The Artistic License 2.0 (see the file LICENSE included with the distribution)'
# Imports
from os.path import join as pjoin
import csv
import logging
import time
import xml.dom.minidom
# Setup the logger
logger = logging.getLogger(__name__)
class Group:
    """A Mascot Group represented as a struct.

    The C{struct} stores the group _ID_, _name_ and a _list of User
    ID's_ associated with that group.

    The C{struct} has one overridden method to facilitate a clear
    print out of the data via C{print Group}.

    Every instance owns its own C{uids} list.  The list is created in
    the constructor (rather than as a class attribute) so that appending
    a user id to one group can never show up in another group.
    """

    id = None    #: The numerical identification for the Mascot Group
    name = None  #: The name of the Mascot Group

    def __init__(self):
        """Constructor.

        Creates an empty group: C{id} and C{name} stay C{None} and
        C{uids} starts out as a new, empty list.
        """
        #: A list of the user identifiers in the Mascot Group
        self.uids = []

    def __repr__(self):
        """Overrides the print functionality.

        @return: the group formatted as C{Group(<id>): <name>, <uids>}
        @rtype: str
        """
        return "Group({0}): {1}, {2}".format(self.id, self.name, self.uids)
class GroupXMLInputFileReader:
    """File Reader for Mascot Group file.

    This class reads the Mascot group file (an XML file), and allows for
    querying the data contained therein. As the Group file is an XML file, this
    code is dependent on the library xml.dom.minidom, and is also written as a
    File Reader rather than as a Stream Reader object that would enable a
    slightly more flexible approach to coding.

    The code contained here in uses the logging module to create debugging
    information.

    >>> import proteomics.mascot
    >>> grp_reader = proteomics.mascot.GroupXMLInputFileReader('test_data/group.xml')
    >>> groups = grp_reader.read_file()
    """

    def __init__(self, filename=pjoin('data', 'group.xml')):
        """Constructor.

        Takes a filename (optional) detailing where the group file exists, in
        order to create the object.  The file is not opened until
        L{read_file} is called.

        @param filename: the filename of the group XML to open (default: 'I{data/group.xml}')
        @type filename: str
        """
        self._filename = filename

    def read_file(self):
        """Reads the Mascot Group file.

        Reads the Mascot group file in (the filename for which the object was
        created with) and obtains the information contained therein.

        The element children of every C{mss:group_data} node are walked in
        document order.  A C{group_id} element starts a new group; the
        C{group_name} and C{users} elements that follow are attached to
        the group most recently started.  Elements with any other name
        are ignored.

        C{>>> groups = grp_reader.read_file()}

        @return: the list of groups contained in the file; an empty list
                 when the document holds no group at all.
        @rtype: the list of Groups (L{Group})
        @raise IndexError: if a C{group_id}, C{group_name} or C{user_id}
                           element has no content.
        @raise ValueError: if a group or user id is not an integer.
        """
        groups = []
        current = None  # the group currently being filled in
        document = xml.dom.minidom.parse(self._filename)  # the group DOM

        # for each "mss:group_data" node in the XML
        for group_data in document.getElementsByTagName('mss:group_data'):
            for child in group_data.childNodes:
                # whitespace text, comments, etc. carry no group information
                if child.nodeType != child.ELEMENT_NODE:
                    continue
                tag = child.localName

                if tag == 'group_id':
                    # a new id closes the previous group (if any)
                    if current is not None:
                        groups.append(current)
                        logger.debug("gid %s stored", current.id)
                        logger.debug("group uids length: %s",
                                     len(current.uids))
                    current = Group()
                    # Always start from a fresh list: this keeps groups
                    # independent even if Group.uids is a shared
                    # class-level list.
                    current.uids = []
                    current.id = int(child.childNodes[0].data)
                    logger.debug("setting gid: %s", child.childNodes[0].data)
                elif tag == 'group_name':
                    current.name = child.childNodes[0].data
                    logger.debug("(gid: %s) name defined: %s",
                                 current.id, child.childNodes[0].data)
                elif tag == 'users':
                    # collect every "mss:user_id" child of the users node
                    for user in child.childNodes:
                        if (user.nodeType == user.ELEMENT_NODE
                                and user.localName == "user_id"):
                            current.uids.append(int(user.childNodes[0].data))
                            logger.debug("(gid: %s) uid added: %s",
                                         current.id, user.childNodes[0].data)

        # store the last group -- but only if the file contained one
        if current is not None:
            groups.append(current)
            logger.debug("gid %s stored", current.id)

        return groups
class LogEntry:
    """One row of the Mascot search log, held as a plain struct.

    Each attribute corresponds to one tab-separated column of the log
    file and is C{None} until a reader fills it in.  The column numbers
    noted below are the ones L{LogInputFileReader} reads each value from.
    """

    # -- columns stored as integers by the reader ------------------------
    searchid = None    #: column 0
    pid = None         #: column 1

    # -- columns stored as text ------------------------------------------
    database = None    #: column 2
    username = None    #: column 3
    useremail = None   #: column 4
    title = None       #: column 5
    filename = None    #: column 6

    # -- timing and outcome ----------------------------------------------
    start = None       #: column 7, parsed with C{time.strptime}
    duration = None    #: column 8, integer
    status = None      #: column 9, text
    priority = None    #: column 10, integer

    # -- remaining columns -----------------------------------------------
    searchtype = None  #: column 11, text
    enzyme = None      #: column 12, text
    ipaddr = None      #: column 13, text
    uid = None         #: column 14, integer
class LogInputFileReader:
    """File Stream Reader for Mascot log files.

    Reads the log file as a "stream"-type reader. It opens up the file, reads
    it into memory and then uses the CSV module to parse through it. It returns
    each row of the log file as a LogEntry, and works by iteration through the
    file.

    The object is its own iterator: it keeps a single row pointer, so a
    second loop over the same object continues where the first one stopped.
    Call L{reset} to start again from the top.  A C{for} loop handles the
    C{StopIteration} raised at the end of the log by itself; only code that
    calls C{next} directly needs to catch it.

    The code does some rudimentry checking on the data and casts data into
    relevant data types rather than keeping them all as str. The code has also
    been set up to record the number of log entries within the object. It works
    with the `len` function.

    >>> import proteomics.mascot
    >>> logs = []
    >>> log_reader = proteomics.mascot.LogInputFileReader('test_data/searches.log')
    >>> for log_entry in log_reader:
    ...     logs.append(log_entry)
    """

    def __init__(self, filename=pjoin('data', 'logs', 'searches.log')):
        """Constructor.

        Takes a filename (optional) detailing where the log file exists, in
        order to create the object. The constructor then loads the entire file
        and parses it using the CSV module (tab-delimited). It is the list of
        elements for each row, that the module stores in memory. It also
        stores the number of rows in a private variable ('_len'). The file is
        closed again before the constructor returns, even when parsing fails.

        @param filename: the filename of the log file to open (default: 'I{data/logs/searches.log}')
        @type filename: str
        """
        import sys  # local import: only needed to choose the file mode

        self._rowi = 0
        # The csv module expects a binary file on Python 2, but a text file
        # opened with newline='' on Python 3.
        if sys.version_info[0] >= 3:
            handle = open(filename, 'r', newline='')
        else:
            handle = open(filename, 'rb')
        # the context manager closes the file even if a row fails to parse
        with handle:
            self._rows = list(csv.reader(handle, delimiter='\t'))
        self._len = len(self._rows)

    def __len__(self):
        """Returns the number of log entries (rows in the file).

        This method facilitates the use of the C{len} function with
        this object.
        """
        return self._len

    def __iter__(self):
        """Allows for iteration throught the rows of the log file.

        This allows a C{for} loop to be conducted on this object, as
        with a list. The object itself is returned, so iteration resumes
        from the current row pointer rather than from the first row.
        """
        return self

    def __next__(self):
        """Move on to the next row of the log file.

        Part of the iteration interface: this method passes back the
        next row of the log file, returning a LogEntry.  The row pointer
        is only advanced once the row has been converted successfully.

        @return: the next log entry
        @rtype: L{LogEntry} struct
        @raise StopIteration: when the row pointer is at the end of the log.
        @raise IndexError: if the row has fewer than 15 columns.
        @raise ValueError: if a numeric column or the start time cannot be
                           converted.
        """
        # check that the row index (_rowi) is not at the end of the file
        if self._rowi >= len(self):
            raise StopIteration

        row = self._rows[self._rowi]
        entry = LogEntry()
        # int() is used for every integer column: Python 3 has no separate
        # long type, and on Python 2 int() promotes large values to long.
        entry.searchid = int(row[0])
        entry.pid = int(row[1])
        entry.database = row[2]
        entry.username = row[3]
        entry.useremail = row[4]
        entry.title = row[5]
        entry.filename = row[6]
        entry.start = time.strptime(row[7], "%a %b %d %H:%M:%S %Y")
        entry.duration = int(row[8])
        entry.status = row[9]
        entry.priority = int(row[10])
        entry.searchtype = row[11]
        entry.enzyme = row[12]
        entry.ipaddr = row[13]
        entry.uid = int(row[14])

        # move the pointer on so the next call returns the following row
        self._rowi += 1
        return entry

    # Python 2 spells the iterator method ``next``; keep both names working.
    next = __next__

    def read_file(self):
        """Reads the log file.

        Reads the Mascot Log file and returns a list of log entries, starting
        at the current row pointer and running to the end of the log.  The
        pointer is left at the end afterwards; use L{reset} to read again.

        C{>>> logs = log_reader.read_file()}

        @return: the list of log entries
        @rtype: list of LogEntries (L{LogEntry})
        """
        return list(self)

    def reset(self, idx=0):
        """Reset the search log row pointer.

        Reset the search log pointer (back to 0 by default).

        @param idx: the index to reset the pointer to (default: 0).
        @type idx: int
        """
        self._rowi = idx
class User:
    """A Mascot User represented as a struct.

    Holds four values for a user: the _ID_, the _username_, the
    _full name_ and the _e-mail_ address.  All of them are C{None}
    until they are assigned.

    C{__repr__} is overridden so that C{print User} gives a readable
    one-line summary.
    """

    id = None        #: The numerical identification for the Mascot User
    username = None  #: The username of the Mascot User
    fullname = None  #: The full name of the Mascot User
    email = None     #: The e-mail address of the Mascot User

    def __repr__(self):
        """Overrides the print functionality.

        @return: the user formatted as
                 C{User(<id>): <username> (<fullname> <<email>>)}
        @rtype: str
        """
        layout = "User({0}): {1} ({2} <{3}>)"
        details = (self.id, self.username, self.fullname, self.email)
        return layout.format(*details)
class UserXMLInputFileReader:
    """File Reader for Mascot User file.

    This class reads the Mascot user file (an XML file), and allows for
    querying the data contained therein. As the User file is an XML file, this
    code is dependent on the library xml.dom.minidom, and is also written as a
    File Reader rather than as a Stream Reader object that would enable a
    slightly more flexible approach to coding.

    The code contained here in uses the logging module to create debugging
    information.

    >>> import proteomics.mascot
    >>> usr_reader = proteomics.mascot.UserXMLInputFileReader('test_data/user.xml')
    >>> users = usr_reader.read_file()
    """

    def __init__(self, filename=pjoin('data', 'user.xml')):
        """Constructor.

        Takes a filename (optional) detailing where the user file exists, in
        order to create the object.  The file is not opened until
        L{read_file} is called.

        @param filename: the filename of the user XML to open (default: 'I{data/user.xml}').
        @type filename: str
        """
        self._filename = filename

    def read_file(self):
        """Reads the Mascot User file.

        Reads the Mascot user file in (the filename for which the object was
        created with) and obtains the information contained therein.

        The element children of every C{mss:user_data} node are walked in
        document order.  A C{userID} element starts a new user; the
        C{userName}, C{fullName} and C{emailAddress} elements that follow
        are attached to the user most recently started.  An empty
        C{emailAddress} element leaves the e-mail as C{None}.  Elements
        with any other name are ignored.

        C{>>> users = usr_reader.read_file()}

        @return: the list of users contained in the file; an empty list
                 when the document holds no user at all.
        @rtype: list of Users (L{User})
        @raise IndexError: if a C{userID}, C{userName} or C{fullName}
                           element has no content.
        @raise ValueError: if a user id is not an integer.
        """
        users = []
        current = None  # the user currently being filled in
        document = xml.dom.minidom.parse(self._filename)  # the user DOM

        # for each "mss:user_data" node in the XML
        for user_data in document.getElementsByTagName('mss:user_data'):
            for child in user_data.childNodes:
                # whitespace text, comments, etc. carry no user information
                if child.nodeType != child.ELEMENT_NODE:
                    continue
                tag = child.localName

                if tag == 'userID':
                    # a new id closes the previous user (if any)
                    if current is not None:
                        users.append(current)
                        logger.debug("uid %s stored", current.id)
                    current = User()
                    current.id = int(child.childNodes[0].data)
                    logger.debug("setting uid: %s", child.childNodes[0].data)
                elif tag == 'userName':
                    current.username = child.childNodes[0].data
                    logger.debug("(uid: %s) name defined: %s",
                                 current.id, child.childNodes[0].data)
                elif tag == 'fullName':
                    current.fullname = child.childNodes[0].data
                    logger.debug("(uid: %s) fullname defined: %s",
                                 current.id, child.childNodes[0].data)
                elif tag == 'emailAddress':
                    # the e-mail element may be empty; then there is
                    # nothing to store
                    if child.childNodes:
                        current.email = child.childNodes[0].data
                        logger.debug("(uid: %s) email defined: %s",
                                     current.id, child.childNodes[0].data)

        # store the last user -- but only if the file contained one
        if current is not None:
            users.append(current)
            logger.debug("uid %s stored", current.id)

        return users
#if __name__ == '__main__':
# logging.basicConfig(level=logging.DEBUG)
#