#!/usr/bin/env python3
"""
BiG-SCAPE

PI: Marnix Medema               marnix.medema@wur.nl

Maintainers/developers:
Jorge Navarro                   j.navarro@westerdijkinstitute.nl
Satria Kautsar                  satria.kautsar@wur.nl
Arjan Draisma                   arjan.draisma@wur.nl

Developer:
Emmanuel (Emzo) de los Santos   E.De-Los-Santos@warwick.ac.uk

Usage:   Please see `python bigscape.py -h`

Example: python bigscape.py -c 8 --pfam_dir ./ -i ./inputfiles -o ./results

Status: beta

Official repository:
https://git.wageningenur.nl/medema-group/BiG-SCAPE

# License: GNU Affero General Public License v3 or later
# A copy of GNU AGPL v3 should have been included in this software package in LICENSE.txt.
"""
# Compatibility imports so the script at least loads under Python 2;
# the version check in the main block below still requires Python 3 to run.
from __future__ import print_function
from __future__ import division

import os
import logging
import warnings
import time
import pickle

from distutils import dir_util
from sys import version_info

# refactored imports
from src import big_scape
from src import gbk
from src import hmm
from src import mibig
from src import pfam
from src import utility
from src import data
def init_logger(options, log_timestamp):
    """Initializes the logger for BiG-SCAPE

    inputs:
        options - options object containing command line argument information
        log_timestamp - timestamp used to name the log file
    """
    ## logging
    log_formatter = logging.Formatter("%(asctime)s %(levelname)-7.7s %(message)s")
    root_logger = logging.getLogger()

    # check options override
    if options.log_path is None:
        log_path = options.outputdir
    else:
        log_path = options.log_path

    # create log dir
    if not os.path.exists(log_path):
        os.mkdir(log_path)

    # set log file
    log_file = os.path.join(log_path, log_timestamp + ".log")
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(log_formatter)
    root_logger.addHandler(file_handler)

    if not options.quiet:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(log_formatter)
        root_logger.addHandler(console_handler)

    if options.verbose:
        root_logger.level = logging.DEBUG
    else:
        root_logger.level = logging.INFO
if __name__ == "__main__":
    ## Initialization steps
    # show an error if we're not on python 3
    if version_info.major != 3:
        logging.error("This Python program was developed for Python 3. "
                      "Please make sure to use Python 3 when running BiG-SCAPE")
        exit(1)

    # get root path of this project
    ROOT_PATH = os.path.realpath(os.path.dirname(__file__))

    # get run options
    # ROOT_PATH is passed here because the imports no longer allow us to use __file__
    OPTIONS = utility.cmd_parser(ROOT_PATH)

    # we want to keep the same timestamp for both the output log
    # as well as the profile log, so define it once here
    LOG_TIMESTAMP = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
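    # the strftime pattern above yields log names like "2024-01-31_13-45-00.log"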
    # init logger
    init_logger(OPTIONS, LOG_TIMESTAMP)

    # initialize the profiler & start it
    PROFILER = utility.Profiler(OPTIONS, LOG_TIMESTAMP)
    PROFILER.start()

    # ignore specific warnings from sklearn
    # TODO: investigate whether this can be mitigated by changing the affinity propagation.
    # concerns the following warning:
    # UserWarning: All samples have mutually equal similarities. Returning arbitrary cluster
    # center(s).
    warnings.filterwarnings(action="ignore", category=UserWarning)
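    # note: this silences *all* UserWarnings, not only the sklearn message quoted above;
    # filterwarnings also accepts `message=` / `module=` arguments if a narrower filter is wanted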
    # TODO: download whatever version of pfam is necessary if it isn't there
    # TODO: next press the hmm file into more optimal formats
    # this currently does not work since it generates hmm files that don't work with the
    # optimized profiles used later on. User still needs to generate their own pressed hmms
    # hmm.pyhmmer_hmmpress(OPTIONS)
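    # a rough sketch of what an automated pressing step might look like, assuming pyhmmer's
    # hmmpress helper and an OPTIONS.pfam_dir attribute (neither verified against this codebase):
    #   import pyhmmer
    #   pfam_path = os.path.join(OPTIONS.pfam_dir, "Pfam-A.hmm")
    #   with pyhmmer.plan7.HMMFile(pfam_path) as hmm_file:
    #       pyhmmer.hmmpress(hmm_file, pfam_path)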
    # create new run details
    # ideally we parse all the options and remember them in this object
    # we will later use options to describe what parameters were used, and
    # use the run object to describe what effect that had, e.g. which files
    # were used
    RUN = big_scape.Run()

    # this should be the only occasion where options is needed. no further than this.
    RUN.init(OPTIONS)

    logging.info("Options set to use %d cores", RUN.options.cores)

    # preparation is done. run timing formally starts here
    RUN.start()

    # init database
    DB_PATH = os.path.join(RUN.directories.output, "data.db")
    DB = data.Database(DB_PATH, True)

    ### Step 1: Get all the input files. Extract sequences and write FASTA files if necessary
    logging.info(" - - Processing input files - -")

    if RUN.mibig.use_mibig:
        mibig.extract_mibig(RUN)

    # load the input data into the database
    data.initialize_db(RUN, DB)

    # base name as a list and as a set
    # also used in all vs all analysis
    BGC_IDS = data.get_cluster_id_list(DB)
    logging.info("Working with %d total bgcs", len(BGC_IDS))

    ### Step 2: Run hmmscan
    # find which sequences have already had their domains predicted
    PREDICTED_IDS = data.get_predicted_bgc_list(DB)

    # find the difference
    IDS_TODO = list(set(BGC_IDS) - set(PREDICTED_IDS))
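    # only BGCs without stored domain predictions end up in IDS_TODO, so re-running against
    # the same data.db skips clusters that were already processed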
    # IDS_TODO = list()

    # run domain prediction (pyhmmer hmmsearch) if there are any sequences which have not yet
    # been processed
    if len(IDS_TODO) > 0:
        logging.info("Predicting %d domains using hmmsearch", len(IDS_TODO))
        # this function blocks the main thread until finished
        hmm.run_pyhmmer_pfam(RUN, DB, IDS_TODO)
        logging.info(" Finished predicting domains")
    else:
        logging.info(" All files were processed by hmmscan. Skipping step...")

    # dumping db here since this is the largest task until this point.
    DB.dump_db_file()

    if RUN.options.feature_filter:
        logging.info(" Generating features")
        features = data.Features.extract(data.get_cluster_id_list(DB), DB)
        for feature in features:
            feature.save(DB)
        DB.commit_inserts()

    ### Step 3: Align high-scoring protein domains (hmmalign)
    # get a list of high scoring protein hits
    HSPS = data.get_hsp_id_list(DB)

    # get a list of already-aligned hsps
    # TODO: work this into a status table like with bgcs
    # currently this just checks if there is an entry for this HSP in the hsp_alignment table
    # TODO: identify changes in dataset and realign
    ALIGNED_HSPS = data.get_aligned_hsp_list(DB)

    # get the alignments to be done
    HSPS_TODO = list(set(HSPS) - set(ALIGNED_HSPS))

    # HSPS_TODO = list()

    # if there are any to be done, we'll align
    if len(HSPS_TODO) > 0:
        logging.info("Performing multiple alignments using hmmalign")
        hmm.do_multiple_align(RUN, DB, HSPS)
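        # note: the full HSPS list is passed above rather than HSPS_TODO, so every HSP is
        # (re)aligned whenever any alignment is missing; it is unclear from this file whether
        # that is intentional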
    else:
        logging.info(" All high scoring protein domains were already aligned. Skipping step...")

    DB.dump_db_file()

    ### Step 4: Create SVG figures
    logging.info(" Creating arrower-like figures for each BGC")

    # read hmm file. We'll need that info anyway for final visualization
    logging.info(" Parsing hmm file for domain information")
    PFAM_INFO = pfam.parse_pfam_a(RUN)
    logging.info(" Done")

    # verify if there are figures already generated
    big_scape.generate_images(RUN, DB, PFAM_INFO)
    logging.info(" Finished creating figures")

    logging.info(" - - Calculating distance matrix - -")

    # copy html templates
    HTML_TEMPLATE_PATH = os.path.join(ROOT_PATH, "html_template", "output")
    dir_util.copy_tree(HTML_TEMPLATE_PATH, RUN.directories.output)
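    # distutils is deprecated and removed in Python 3.12; shutil.copytree(HTML_TEMPLATE_PATH,
    # RUN.directories.output, dirs_exist_ok=True) should be an equivalent replacement
    # (dirs_exist_ok requires Python 3.8+)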
    # make a new run folder in the html output & copy the overview_html
    big_scape.copy_template_per_cutoff(RUN, ROOT_PATH)
    RUNDATA_NETWORKS_PER_RUN = big_scape.prepare_cutoff_rundata_networks(RUN)
    HTML_SUBS_PER_RUN = big_scape.prepare_html_subs_per_run(RUN)

    # TODO: refactor ArrowerSVG such that it can use the database so we don't have
    # to generate the PFS files. currently arrowersvg requires these files to work
    data.generate_pfd_files(RUN, DB)

    # create pfams.js
    pfam.create_pfam_js(RUN, PFAM_INFO)

    # Try to make default analysis using all files found inside the input folder
    logging.info("Generating distance network files with ALL available input files")

    # This version contains info on all bgcs with valid classes
    logging.info(" Writing the complete Annotations file for the complete set")
    # big_scape.write_network_annotation_file(RUN, BGC_COLLECTION)

    # TODO: rework data storage from this point onwards. Now we're converting
    # back to large amounts of data in memory because refactoring the storage
    # after this point is a massive task
    BGC_DATA_DICT, GBK_FILE_DICT, MIBIG_SET = gbk.import_gbks(RUN)
    BGC_COLLECTION = data.generate_bgc_collection(RUN, DB, BGC_DATA_DICT, GBK_FILE_DICT)
    ALIGNED_DOMAIN_SEQS = data.generate_aligned_domain_seqs(RUN, DB)
    MIBIG_SET_INDICES = data.generate_mibig_set_indices(RUN, BGC_COLLECTION, MIBIG_SET)

    # Making network files mixing all classes
    if RUN.options.mix:
        big_scape.generate_network(RUN, DB, BGC_COLLECTION, ALIGNED_DOMAIN_SEQS,
                                   MIBIG_SET_INDICES, MIBIG_SET, RUNDATA_NETWORKS_PER_RUN,
                                   HTML_SUBS_PER_RUN, True)

    # Making network files separating by BGC class
    if not RUN.options.no_classify:
        big_scape.generate_network(RUN, DB, BGC_COLLECTION, ALIGNED_DOMAIN_SEQS,
                                   MIBIG_SET_INDICES, MIBIG_SET, RUNDATA_NETWORKS_PER_RUN,
                                   HTML_SUBS_PER_RUN, False)

    # fetch genome list for overview.js
    INPUT_CLUSTERS_IDX = []
    BGC_DATA_DICT = data.gen_bgc_info_for_fetch_genome(DB)
    GBK_FILE_DICT = data.get_cluster_gbk_dict(RUN, DB)
    big_scape.fetch_genome_list(RUN, INPUT_CLUSTERS_IDX, BGC_COLLECTION.bgc_name_tuple, MIBIG_SET,
                                BGC_DATA_DICT, GBK_FILE_DICT)

    # update family data (convert global bgc indexes into input-only indexes)
    big_scape.update_family_data(RUNDATA_NETWORKS_PER_RUN, INPUT_CLUSTERS_IDX,
                                 BGC_COLLECTION.bgc_name_tuple, MIBIG_SET)

    # generate overview data
    RUN.end()
    big_scape.generate_results_per_cutoff_value(RUN, RUNDATA_NETWORKS_PER_RUN, HTML_SUBS_PER_RUN)

    # done
    RUN.report_runtime()
    PROFILER.stop()