pyna-colada/pyna-collada at master · parisepigenetics/pyna-colada · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#! /usr/bin/env python

"""Pyna-collada is a tool that draws chromosome contacts of specific pre-specified regions from a whole genome Hi-C .hic file chromosome map. Visualises the resulting contact sub-matrices in interactive html files.
"""

#__version__ = "0.5"

import argparse
import plotly
import plotly.graph_objects as go
import hicstraw

import pyna_collada as pc

# TODO fix the rsolution argument ranges of accepted values from straw
parser = argparse.ArgumentParser(
    prog="pyna_collada",
    description="Tool to visualise Hi-C contacts from an annotated gene list of interest.",
    epilog="Authors: Costas Bouyioukos, 2019-2022, Universite Paris Cite and UMR7216.")
parser.add_argument(
    "hicFile",
    type=str,
    metavar="hic_file",
    help="Filename (or path) of .hic file (NO option for STDIN)")
parser.add_argument(
    "genesAnnot",
    type=str,  #argparse.FileType("r"),
    metavar="gene_annot_coords_file",
    help="Filename (or path) of annotation file (usually from ENSEMBL) containing at least the gene name, gene start and end, chromosome and strand")
parser.add_argument(
    "outfile",
    type=str,
    metavar="out_file",
    help="Filename (or path) of the results .html figure file (NO option for STDOUT)")
parser.add_argument(
    "-b",
    "--bin-size",
    help="Seelction of the bin size of the hi-c map (i.e. resolution). Default=10000",
    type=int,
    default=10000,
    dest="binSize",
    metavar="bin_size")
parser.add_argument(
    "-c",
    "--chromosomes",
    nargs="+",
    type=int,
    default=1,
    help="The chromosome index(es) of which we want to extract contacts. Deafult: 1 (corresponds to the first chromosome in the hic file)",
    metavar="chr_index",
    dest="chr")
parser.add_argument(
    "-d",
    "--downstream-offset",
    type=int,
    default=500,
    help="The number of bps to offset down-stream of TSS for defining the overlaping regions. Default: 500",
    metavar="downstream_off",
    dest="offsetD")
parser.add_argument(
    "-u",
    "--upstream-offset",
    type=int,
    default=1000,
    help="The number of bps to offset up-stream of TSS for defining the overlaping regions. Default: 1000",
    metavar="upstream_off",
    dest="offsetU")
parser.add_argument(
    "-i",
    "--interChromosomal",
    action="store_true",
    dest="inter",
    help="Flag to turn on inter- and intra-chromosomal contacts. Default: intra-chromosomal only")
parser.add_argument(
    "-n",
    "--normalisation",
    nargs="?",
    default="NONE",
    metavar="norm_meth",
    type=str,
    help="Choise of a normalisation method from the Juice-tools or hic-straw (One of VC, VC_SQRT, KR, Default: NONE)",
    dest="norm")
parser.add_argument(
    "-t",
    "--type",
    nargs="?",
    default="BP",
    metavar="Type",
    type=str,
    help="Choise of the hic-straw extracted measure. Default: BP",
    dest="type")
parser.add_argument(
    "-v",
    "--version",
    action="version",
    version=0.5)


# Parse the command line arguments
optArgs = parser.parse_args()

# Sort out the options selection for chromosomes and resolution by interogating the HiC file
hic = hicstraw.HiCFile(optArgs.hicFile)
# Get a list of chromosome names
if isinstance(optArgs.chr, int):
    chromNames =  [hic.getChromosomes()[optArgs.chr].name]
else:
    chromNames = [hic.getChromosomes()[i].name for i in optArgs.chr]
# Check resolution consistency
resolutions = hic.getResolutions()
if optArgs.binSize not in resolutions:
    raise ValueError(f"Selected bin size {optArgs.binSize} not included in supported hic file resolutions {resolutions}")

# Parse annotation file
ga = pc.parse_annotation(optArgs)

# Calculate the interaction intervals
gaExp = pc.expand_gene_annotation_bins(ga, optArgs)

# Extract the contacts data from HiC file
contacts = {}
for i in range(len(chromNames)):
    chrA = chromNames[i]
    contacts[chrA] = {}
    if optArgs.inter:  # Get also the inter-chromosomal contacts
        for j in range(i, len(chromNames)):
            chrB = chromNames[j]
            contacts[chrA][chrB] = pc.get_contacts_frame(optArgs, chrA, chrB)
    else:
        chrB = chrA
        contacts[chrA][chrB] = pc.get_contacts_frame(optArgs, chrA, chrB)
# TODO the above loops will need to be parallelised for future versions

# Main function to populate the data frame
geneIntContacts = pc.populate_contacts_ofInterest(contacts, gaExp)

# Ploting with plotly
fig = go.Figure()
t1 = pc.matrx_trace(geneIntContacts)
fig.add_trace(t1)  # Deals only for one matrix of interest for the moment
# TODO in later versions
fig.update_layout(width=1600, height=1600,
                  title=f"Selected contacts from chromosome(s): {chromNames}",
                  xaxis_title="Gene name - Coordinates", yaxis_title="Gene name - coordinates",
                  legend_title="Contacts",
                  font=dict(family="Droid Sans", size=14)
                  )
fig.update_yaxes(automargin=True)
fig.update_xaxes(automargin=True)
plotly.offline.plot(fig, filename=optArgs.outfile, auto_open=False)
# TODO TODO! deal properly with multiple chromosomes and report more than one outfiles.