forked from Nesvilab/philosopher
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathphilosopher.yml
308 lines (295 loc) · 30.3 KB
/
philosopher.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
# Philosopher pipeline configuration file
# Version 4.1.1
#
# The pipeline mode automates the processing done by Philosopher and other tools. First, check
# the steps you want to execute in the commands section and change them to
# 'yes'. For each selected command, go to its section and adjust the parameters
# accordingly to your analysis.
#
# If you want to include MSFragger and TMT-Integrator into your analysis, you will
# have to download them separately, and then add their location in their
# configuration section.
#
# Usage:
# philosopher pipeline --config <this_configuration_file> [list_of_data_set_folders]
analytics: true # reports when a workspace is created for usage statistics
slackToken: # specify the Slack API token (how to generate a token: https://api.slack.com/legacy/custom-integrations/legacy-tokens)
slackChannel: # specify the channel name, or
slackUserID: # specify a user ID for a direct message
Steps:
Database Search: yes # peptide to spectrum matching with Comet or MSFragger
Peptide Validation: no # peptide assignment validation with PeptideProphet
PTM Localization: no # PTM site localization with PTMProphet
Protein Inference: no # protein identification validation with ProteinProphet
Label-Free Quantification: no # precursor label-free quantification inspired by moFF
Isobaric Quantification: no # isobaric labeling-based relative quantification for TMT and iTRAQ
Bio Cluster Quantification: no # protein report based on Uniprot protein clusters
FDR Filtering: no # statistical filtering, validation and false discovery r ates assessment
Individual Reports: no # multi-level reporting for both narrow-searches and open-searches
Integrated Reports: no # combined analysis of LC-MS/MS results inspired by Abacus
Integrated Isobaric Quantification: no # integrates channel abundances from multiple isobaric-tagged samples with TMT-Integrator
Database Search: # MSFragger-3.5 & Comet v2019011
protein_database: # path to the target-decoy protein database
decoy_tag: rev_ # prefix tag used added to decoy sequences
contaminant_tag: false # prefix tag used added to decoy sequences
search_engine: msfragger # search engine options include "comet" and "msfragger"
comet: # Comet v2019011
noindex: true # skip mzML file indexing
param: # comet parameter file (default "comet.params.txt")
extension: mzML # format of the spectra file
msfragger: # MSFragger v3.5
path: # path to MSFragger jar
memory: 8 # how much memory in GB to use
param: # MSFragger parameter file
extension: mzML # spectra format
num_threads: 0 # number of CPU threads to use. 0=poll CPU to set num threads
precursor_mass_lower: -20 # lower bound of the precursor mass window
precursor_mass_upper: 20 # upper bound of the precursor mass window
precursor_mass_units: 1 # 0=Daltons, 1=ppm
data_type: 0 # 0 for DDA, 1 for DIA, 2 for DIA-narrow-window
precursor_true_tolerance: 20 # true precursor mass tolerance (window is +/- this value)
precursor_true_units: 1 # 0=Daltons, 1=ppm
fragment_mass_tolerance: 20 # fragment mass tolerance (window is +/- this value)
fragment_mass_units: 1 # fragment mass tolerance units (0 for Da, 1 for ppm)
calibrate_mass: 2 # 0=Off, 1=On, 2=On and find optimal parameters
use_all_mods_in_first_search: 0 # use all variable modifications in first search (0 for No, 1 for Yes).
write_calibrated_mgf: 0 # write calibrated MS2 scan to a MGF file (0 for No, 1 for Yes)
isotope_error: 0/1/2 # 0=off, 0/1/2 (standard C13 "error")
mass_offsets: 0 # allow for additional precursor mass window shifts. Multiplexed with isotope_error. mass_offsets = 0/79.966 can be used as a restricted ‘open’ search that looks for unmodified and phosphorylated peptides (on any residue)
restrict_deltamass_to: all # specify amino acids on which delta masses (mass offsets or search modifications) can occur. Allowed values are single letter codes (e.g. ACD), must
precursor_mass_mode: selected # one of isolated/selected/corrected.
localize_delta_mass: 0 # this allows shifted fragment ions - fragment ions with mass increased by the calculated mass difference, to be included in scoring
delta_mass_exclude_ranges: (-1.5,3.5) # exclude mass range for shifted ions searching
fragment_ion_series: b,y # ion series used in search
ion_series_definitions: # user defined ion series. (Example: b* N -17.026548;b0 N -18.010565)
search_enzyme_name_1: Trypsin # Name of the first enzyme.
search_enzyme_cut_1: KR # First enzyme's cutting amino acid.
search_enzyme_nocut_1: P # First enzyme's protecting amino acid.
allowed_missed_cleavage_1: 2 # First enzyme's allowed number of missed cleavages per peptide. Maximum value is 5.
search_enzyme_sense_1: C # First enzyme's cutting terminal.
search_enzyme_name_2: # Name of the second enzyme.
search_enzyme_cut_2: # Second enzyme's cutting amino acid.
search_enzyme_nocut_2: # Second enzyme's protecting amino acid.
allowed_missed_cleavage_2: # Second enzyme's allowed number of missed cleavages per peptide. Maximum value is 5.
search_enzyme_sense_2: C # Second enzyme's cutting terminal.
num_enzyme_termini: 2 # 2 for enzymatic, 1 for semi-enzymatic, 0 for nonspecific digestion
clip_nTerm_M: 1 # specifies the trimming of a protein N-terminal methionine as a variable modification (0 or 1)
variable_mod_01: 15.99490 M 3 # variable modification
variable_mod_02: 42.01060 [^ 1 # variable modification
variable_mod_03: # variable modification
variable_mod_04: # variable modification
variable_mod_05: # variable modification
variable_mod_06: # variable modification
variable_mod_07: # variable modification
allow_multiple_variable_mods_on_residue: 0 # static mods are not considered
max_variable_mods_per_peptide: 3 # maximum of 5
max_variable_mods_combinations: 5000 # maximum of 65534, limits number of modified peptides generated from sequence
mass_diff_to_variable_mod: 0 # put mass diff as a variable modification. 0 for no; 1 for yes and change the original mass diff and the calculated mass accordingly; 2 for yes but do not change the original mass diff.
output_file_extension: pepXML # file extension of output files
output_format: pepXML # file format of output files (pepXML or tsv)
output_report_topN: 1 # reports top N PSMs per input spectrum
output_max_expect: 50 # suppresses reporting of PSM if top hit has expectation greater than this threshold
report_alternative_proteins: 0 # 0=no, 1=yes
precursor_charge: 1 4 # assume range of potential precursor charge states. Only relevant when override_charge is set to 1
override_charge: 0 # 0=no, 1=yes to override existing precursor charge states with precursor_charge parameter
digest_min_length: 7 # minimum length of peptides to be generated during in-silico digestion
digest_max_length: 50 # maximum length of peptides to be generated during in-silico digestion
digest_mass_range: 500.0 5000.0 # mass range of peptides to be generated during in-silico digestion in Daltons
max_fragment_charge: 2 # maximum charge state for theoretical fragments to match (1-4)
track_zero_topN: 0 # in addition to topN results, keep track of top results in zero bin
zero_bin_accept_expect: 0 # boost top zero bin entry to top if it has expect under 0.01 - set to 0 to disable
zero_bin_mult_expect: 1 # disabled if above passes - multiply expect of zero bin for ordering purposes (does not affect reported expect)
add_topN_complementary: 0 # inserts complementary ions corresponding to the top N most intense fragments in each experimental spectra
check_spectral_files: 1 # check the spectral files before searching.
minimum_peaks: 15 # required minimum number of peaks in spectrum to search (default 10)
use_topN_peaks: 150 # pre-process experimental spectrum to only use top N peaks
deisotope: 1 # activates deisotoping.
deneutralloss: 1 # performs deneutrallossing or not (0=no, 1=yes)
min_fragments_modelling: 2 # minimum number of matched peaks in PSM for inclusion in statistical modeling
min_matched_fragments: 4 # minimum number of matched peaks for PSM to be reported
minimum_ratio: 0.01 # filters out all peaks in experimental spectrum less intense than this multiple of the base peak intensity
clear_mz_range: 0.0 0.0 # for iTRAQ/TMT type data; will clear out all peaks in the specified m/z range
remove_precursor_peak: 0 # remove precursor peaks from tandem mass spectra. 0=not remove; 1=remove the peak with precursor charge; 2=remove the peaks with all charge states.
remove_precursor_range: -1.5,1.5 # m/z range in removing precursor peaks. Unit: Da.
intensity_transform: 0 # transform peaks intensities with sqrt root. 0=not transform; 1=transform using sqrt root.
labile_search_mode: off # type of search (nglycan, labile, or off). Off means non-labile/typical search.
diagnostic_intensity_filter: 0 # [nglycan/labile search_mode only]. Minimum relative intensity for SUM of all detected oxonium ions to achieve for spectrum to contain diagnostic fragment evidence. Calculated relative to spectrum base peak. 0 <= value.
Y_type_masses: # [nglycan/labile search_mode only]. Specify fragments of labile mods that are commonly retained on intact peptides (e.g. Y ions for glycans). Only used if 'Y' is included in fragment_ion_series.
diagnostic_fragments: # [nglycan/labile search_mode only]. Specify diagnostic fragments of labile mods that appear in the low m/z region. Only used if diagnostic_intensity_filter > 0.
add_Cterm_peptide: 0.000000 # c-term peptide fixed modifications
add_Cterm_protein: 0.000000 # c-term protein fixed modifications
add_Nterm_peptide: 0.000000 # n-term peptide fixed modifications
add_Nterm_protein: 0.000000 # n-term protein fixed modifications
add_A_alanine: 0.000000 # alanine fixed modifications
add_C_cysteine: 57.021464 # cysteine fixed modifications
add_D_aspartic_acid: 0.000000 # aspartic acid fixed modifications
add_E_glutamic_acid: 0.000000 # glutamic acid fixed modifications
add_F_phenylalanine: 0.000000 # phenylalanine fixed modifications
add_G_glycine: 0.000000 # glycine fixed modifications
add_H_histidine: 0.000000 # histidine fixed modifications
add_I_isoleucine: 0.000000 # isoleucine fixed modifications
add_K_lysine: 0.000000 # lysine fixed modifications
add_L_leucine: 0.000000 # leucine fixed modifications
add_M_methionine: 0.000000 # methionine fixed modifications
add_N_asparagine: 0.000000 # asparagine fixed modifications
add_P_proline: 0.000000 # proline fixed modifications
add_Q_glutamine: 0.000000 # glutamine fixed modifications
add_R_arginine: 0.000000 # arginine fixed modifications
add_S_serine: 0.000000 # serine fixed modifications
add_T_threonine: 0.000000 # threonine fixed modifications
add_V_valine: 0.000000 # valine fixed modifications
add_W_tryptophan: 0.000000 # tryptophan fixed modifications
add_Y_tyrosine: 0.000000 # tyrosine fixed modifications
Peptide Validation: # PeptideProphet v5.2
concurrent: false # Concurrent execution of multiple instaces
extension: pepXML # pepXML file extension
clevel: 0 # set Conservative Level in neg_stdev from the neg_mean, low numbers are less conservative, high numbers are more conservative
accmass: true # use Accurate Mass model binning
decoyprobs: true # compute possible non-zero probabilities for Decoy entries on the last iteration
enzyme: trypsin # enzyme used in sample (optional)
exclude: false # exclude deltaCn*, Mascot*, and Comet* results from results (default Penalize * results)
expectscore: true # use expectation value as the only contributor to the f-value for modeling
forcedistr: false # bypass quality control checks, report model despite bad modeling
glyc: false # enable peptide Glyco motif model
icat: false # apply ICAT model (default Autodetect ICAT)
instrwarn: false # warn and continue if combined data was generated by different instrument models
leave: false # leave alone deltaCn*, Mascot*, and Comet* results from results (default Penalize * results)
maldi: false # enable MALDI mode
masswidth: 5 # model mass width (default 5)
minpeplen: 7 # minimum peptide length not rejected (default 7)
minpintt: 2 # minimum number of NTT in a peptide used for positive pI model (default 2)
minpiprob: 0.9 # minimum probability after first pass of a peptide used for positive pI model (default 0.9)
minprob: 0.05 # report results with minimum probability (default 0.05)
minrtntt: 2 # minimum number of NTT in a peptide used for positive RT model (default 2)
minrtprob: 0.9 # minimum probability after first pass of a peptide used for positive RT model (default 0.9)
neggamma: false # use Gamma distribution to model the negative hits
noicat: false # do no apply ICAT model (default Autodetect ICAT)
nomass: false # disable mass model
nonmc: false # disable NMC missed cleavage model
nonparam: true # use semi-parametric modeling, must be used in conjunction with --decoy option
nontt: false # disable NTT enzymatic termini model
optimizefval: false # (SpectraST only) optimize f-value function f(dot,delta) using PCA
phospho: false # enable peptide Phospho motif model
pi: false # enable peptide pI model
ppm: true # use PPM mass error instead of Dalton for mass modeling
zero: false # report results with minimum probability 0
PTM Localization: # PTMProphet v6.1
autodirect: false # use direct evidence when the lability is high, use in combination with LABILITY
cions: # use specified C-term ions, separate multiple ions by commas (default: y for CID, z for ETD)
direct: false # use only direct evidence for evaluating PTM site probabilities
em: 2 # set EM models to 0 (no EM), 1 (Intensity EM Model Applied) or 2 (Intensity and Matched Peaks EM Models Applied)
static: false # use static fragppmtol for all PSMs instead of dynamically estimates offsets and tolerances
fragppmtol: 15 # when computing PSM-specific mass_offset and mass_tolerance, use specified default +/- MS2 mz tolerance on fragment ions
ifrags: false # use internal fragments for localization
keepold: false # retain old PTMProphet results in the pepXML file
lability: false # compute Lability of PTMs
massdiffmode: false # use the Mass Difference and localize
excludemassdiffmin: 0 # Minimum mass difference excluded for MASSDIFFFMODE analysis (default=0)
excludemassdiffmax: 0 # Maximun mass difference excluded for MASSDIFFFMODE analysis (default=0)
massoffset: 0 # adjust the massdiff by offset (0 = use default)
maxfragz: 0 # limit maximum fragment charge (default: 0=precursor charge, negative values subtract from precursor charge)
maxthreads: 4 # use specified number of threads for processing
mino: 0 # use specified number of pseudo-counts when computing Oscore (0 = use default)
minprob: 0 # use specified minimum probability to evaluate peptides
mods: # specify modifications
nions: # use specified N-term ions, separate multiple ions by commas (default: a,b for CID, c for ETD)
nominofactor: false # disable MINO factor correction when MINO= is set greater than 0 (default: apply MINO factor correction)
ppmtol: 1 # use specified +/- MS1 ppm tolerance on peptides which may have a slight offset depending on search parameters
verbose: false # produce Warnings to help troubleshoot potential PTM shuffling or mass difference issues
Protein Inference: # ProteinProphet v5.2
accuracy: false # equivalent to --minprob 0
allpeps: false # consider all possible peptides in the database in the confidence model
confem: false # use the EM to compute probability given the confidence
delude: false # do NOT use peptide degeneracy information when assessing proteins
excludezeros: false # exclude zero prob entries
fpkm: false # model protein FPKM values
glyc: false # highlight peptide N-glycosylation motif
icat: false # highlight peptide cysteines
instances: false # use Expected Number of Ion Instances to adjust the peptide probabilities prior to NSP adjustment
iprophet: false # input is from iProphet
logprobs: false # use the log of the probabilities in the Confidence calculations
maxppmdiff: 20 # maximum peptide mass difference in PPM (default 20)
minprob: 0.05 # peptideProphet probabilty threshold (default 0.05)
mufactor: 1 # fudge factor to scale MU calculation (default 1)
nogroupwts: false # check peptide's Protein weight against the threshold (default: check peptide's Protein Group weight against threshold)
nonsp: false # do not use NSP model
nooccam: false # non-conservative maximum protein list
noprotlen: false # do not report protein length
normprotlen: false # normalize NSP using Protein Length
protmw: false # get protein mol weights
softoccam: false # peptide weights are apportioned equally among proteins within each Protein Group (less conservative protein count estimate)
unmapped: false # report results for UNMAPPED proteins
Label-Free Quantification: # Freequant
peakTimeWindow: 0.4 # specify the time windows for the peak (minute) (default 0.4)
retentionTimeWindow: 3 # specify the retention time window for xic (minute) (default 3)
tolerance: 10 # m/z tolerance in ppm (default 10)
raw: false # read raw files instead of converted mzML, or mzXML
faims: false # use FAIMS information for the quantification
Isobaric Quantification: # Labelquant
bestPSM: false # select the best PSMs for protein quantification
level: 2 # ms level for the quantification
minProb: 0.7 # only use PSMs with a minimum probability score
plex: # number of channels
purity: 0.5 # ion purity threshold (default 0.5)
removeLow: 0.0 # ignore the lower 3% PSMs based on their summed abundances
tolerance: 20 # m/z tolerance in ppm (default 20)
uniqueOnly: false # report quantification based on only unique peptides
brand: tmt # isobaric labeling brand (tmt, itraq)
raw: false # read raw files instead of converted mzML, or mzXML
Bio Cluster Quantification: # BioQuant
organismUniProtID: # UniProt proteome ID
level: 0.9 # cluster identity level (default 0.9)
FDR Filtering: # Filter
psmFDR: 0.01 # psm FDR level (default 0.01)
peptideFDR: 0.01 # peptide FDR level (default 0.01)
ionFDR: 0.01 # peptide ion FDR level (default 0.01)
proteinFDR: 0.01 # protein FDR level (default 0.01)
peptideProbability: 0.7 # top peptide probability threshold for the FDR filtering (default 0.7)
proteinProbability: 0.5 # protein probability threshold for the FDR filtering (not used with the razor algorithm) (default 0.5)
peptideWeight: 1 # threshold for defining peptide uniqueness (default 1)
razor: false # use razor peptides for protein FDR scoring
picked: false # apply the picked FDR algorithm before the protein scoring
mapMods: false # map modifications acquired by an open search
models: false # print model distribution
sequential: false # alternative algorithm that estimates FDR using both filtered PSM and Protein lists
Individual Reports: # Report
msstats: false # create an output compatible to MSstats
withDecoys: false # add decoy observations to reports
mzID: false # create a mzID output
prefix: false # add the project (folder) name as a prefix to the output files
Integrated Reports: # Abacus
protein: true # global level protein report
peptide: true # global level peptide report
proteinProbability: 0.9 # minimum protein probability (default 0.9)
peptideProbability: 0.5 # minimum peptide probability (default 0.5)
uniqueOnly: false # report TMT quantification based on only unique peptides
reprint: false # create abacus reports using the Reprint format
Integrated Isobaric Quantification: # TMT-Integrator v4.0.0
path: # path to TMT-Integrator jar
memory: 6 # memory allocation, in Gb
output: # the location of output files
channel_num: 10 # number of channels in the multiplex (e.g. 10, 11)
ref_tag: Bridge # unique tag for identifying the reference channel (Bridge sample added to each multiplex)
groupby: -1 # level of data summarization(0: PSM aggregation to the gene level; 1: protein; 2: peptide sequence; 3: PTM site; -1: generate reports at all levels)
psm_norm: false # perform additional retention time-based normalization at the PSM level
outlier_removal: true # perform outlier removal
prot_norm: -1 # normalization (0: None; 1: MD (median centering); 2: GN (median centering + variance scaling); -1: generate reports with all normalization options)
min_pep_prob: 0.9 # minimum PSM probability threshold (in addition to FDR-based filtering by Philosopher)
min_purity: 0.5 # ion purity score threshold
min_percent: 0.05 # remove low intensity PSMs (e.g. value of 0.05 indicates removal of PSMs with the summed TMT reporter ions intensity in the lowest 5% of all PSMs)
unique_pep: false # allow PSMs with unique peptides only (if true) or unique plus razor peptides (if false), as classified by Philosopher and defined in PSM.tsv files
unique_gene: 0 # additional, gene-level uniqueness filter (0: allow all PSMs; 1: remove PSMs mapping to more than one GENE with evidence of expression in the dataset; 2:remove all PSMs mapping to more than one GENE in the fasta file)
best_psm: true # keep the best PSM only (highest summed TMT intensity) among all redundant PSMs within the same LC-MS run
prot_exclude: none # exclude proteins with specified tags at the beginning of the accession number (e.g. none: no exclusion; sp|,tr| : exclude protein with sp| or tr|)
allow_overlabel: true # allow PSMs with TMT on S (when overlabeling on S was allowed in the database search)
allow_unlabeled: true # allow PSMs without TMT tag or acetylation on the peptide n-terminus
mod_tag: none # PTM info for generation of PTM-specific reports (none: for Global data; S[167],T[181],Y[243]: for Phospho; K[170]: for K-Acetyl)
min_site_prob: -1 # site localization confidence threshold (-1: for Global; 0: as determined by the search engine; above 0 (e.g. 0.75): PTMProphet probability, to be used with phosphorylation only)
ms1_int: true # use MS1 precursor ion intensity (if true) or MS2 summed TMT reporter ion intensity (if false) as part of the reference sample abundance estimation
top3_pep: true # use top 3 most intense peptide ions as part of the reference sample abundance estimation
print_RefInt: false # print individual reference sample abundance estimates for each multiplex in the final reports (in addition to the combined reference sample abundance estimate)
add_Ref: -1 # add an artificial reference channel if there is no reference channel (-1: don't add the reference; 0: use summation as the reference; 1: use average as the reference; 2: use median as the reference)
max_pep_prob_thres: 0 # the threshold for maximum peptide probability
min_ntt: 0 # minimum allowed number of enzymatic termini
aggregation_method: 0 # the aggregation method from the PSM level to the specified level (0: median, 1: weighted-ratio)
log2transformed: true # report ratio and abundance reports in the log2 scale