Skip to content

Commit

Permalink
Merge pull request #747 from deeptools/aggregate_average_strand_infor…
Browse files Browse the repository at this point in the history
…mation

Aggregate average strand information
  • Loading branch information
joachimwolff authored Aug 9, 2021
2 parents ed804ad + 861ab0f commit c7135ff
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 99 deletions.
152 changes: 56 additions & 96 deletions hicexplorer/hicAggregateContacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,11 @@ def parse_arguments(args=None):
'intra-chromosomal contacts are of interest.',
action='store_true',
required=False)

parserOpt.add_argument('--considerStrandDirection',
help='This parameter specifies if the strand information is taken into account for the aggregation. '
'It has the effect that the contacts of a reverse strand region are inverted e.g. [1,2,3] becomes [3,2,1].',
action='store_true',
required=False)
parserOpt.add_argument('--largeRegionsOperation',
help='If a given coordinate in the bed file is larger than '
'a bin of the input matrix, by default only the first bin '
Expand Down Expand Up @@ -232,7 +236,7 @@ def parse_arguments(args=None):
return parser


def read_bed_per_chrom(fh, chrom_list):
def read_bed_per_chrom(fh, chrom_list, pConsiderStrandDirection=None):
"""
Reads the given BED file returning
a dictionary that contains, per each chromosome
Expand All @@ -243,20 +247,28 @@ def read_bed_per_chrom(fh, chrom_list):
if line[0] == "#":
continue
fields = line.strip().split()
if pConsiderStrandDirection and len(fields) < 6:
log.error('Strand information should be considered, but BED file has not at least six columns. Exiting!')
exit(1)
# log.debug('fields {}'.format(fields))
if fields[0] not in chrom_list:
if change_chrom_names(fields[0]) in chrom_list:
fields[0] = change_chrom_names(fields[0])
else:
continue
if fields[0] not in interval:
interval[fields[0]] = []
interval[fields[0]].append((int(fields[1]), int(fields[2])))

if pConsiderStrandDirection:
interval[fields[0]].append((int(fields[1]), int(fields[2]), str(fields[5])))
else:
interval[fields[0]].append((int(fields[1]), int(fields[2])))
return interval


def aggregate_contacts(bed1, bed2, agg_info, ma, M_half, largeRegionsOperation, range=None, transform=None, mode=''):
def aggregate_contacts(bed1, bed2, agg_info, ma, M_half, largeRegionsOperation, range=None, transform=None, mode='', pConsiderStrandDirection=None):
"""
To aggregate the contacts of desired sumatrices.
To aggregate the contacts of desired submatrices.
"""
seen_chrs = []
for k1, v1 in bed1.items():
Expand All @@ -272,7 +284,11 @@ def aggregate_contacts(bed1, bed2, agg_info, ma, M_half, largeRegionsOperation,
for coord2 in v2:
if (k1 == k2) and (coord1 == coord2):
continue
interval = [(k1, coord1[0], coord1[1]), (k2, coord2[0], coord2[1])]
if pConsiderStrandDirection:
interval = [(k1, coord1[0], coord1[1], coord1[2]), (k2, coord2[0], coord2[1], coord2[2])]
else:
interval = [(k1, coord1[0], coord1[1]), (k2, coord2[0], coord2[1])]

count_contacts(interval, ma, M_half, mode, agg_info, largeRegionsOperation, seen_chrs, range, transform)
to_del = []
for k1, v1 in agg_info["agg_matrix"].items():
Expand All @@ -289,7 +305,7 @@ def aggregate_contacts(bed1, bed2, agg_info, ma, M_half, largeRegionsOperation,
agg_info["agg_contact_position"] = {key: val for key, val in agg_info["agg_contact_position"].items() if key not in to_del}


def aggregate_contacts_per_row(bed1, bed2, agg_info, ma, chrom_list, M_half, largeRegionsOperation, range=None, transform=None, mode='', perChr=False):
def aggregate_contacts_per_row(bed1, bed2, agg_info, ma, chrom_list, M_half, largeRegionsOperation, range=None, transform=None, mode='', perChr=False, pConsiderStrandDirection=None):
"""
To aggregate the contacts of the desired submatrices , if row-wise.
"""
Expand All @@ -311,7 +327,11 @@ def aggregate_contacts_per_row(bed1, bed2, agg_info, ma, chrom_list, M_half, lar
elif mode == 'intra-chr': # skip the lines with different chrom
if line1[0] != line2[0]:
continue
interval = [(line1[0], line1[1], line1[2]), (line2[0], line2[1], line2[2])]
if pConsiderStrandDirection:
interval = [(line1[0], line1[1], line1[2], line1[5]), (line2[0], line2[1], line2[2], line2[5])]
else:
interval = [(line1[0], line1[1], line1[2]), (line2[0], line2[1], line2[2])]

count_contacts(interval, ma, M_half, mode, agg_info, largeRegionsOperation, seen_chrs, range, transform)
to_del = []
for k1, v1 in agg_info["agg_matrix"].items():
Expand All @@ -329,9 +349,16 @@ def count_contacts(interval, ma, M_half, mode, agg_info, largeRegionsOperation,
"""
To count the number of contacts for a given pair of intervals
"""
log.debug('interval {}'.format(interval))

chrom1, start1, end1 = interval[0]
chrom2, start2, end2 = interval[1]
orientation1 = None
orientation2 = None
if len(interval[0]) == 3 and len(interval[1]) == 3:
chrom1, start1, end1 = interval[0]
chrom2, start2, end2 = interval[1]
elif len(interval[0]) == 4 and len(interval[1]) == 4:
chrom1, start1, end1, orientation1 = interval[0]
chrom2, start2, end2, orientation2 = interval[1]

if chrom1 not in agg_info["chrom_coord"]:
return
Expand Down Expand Up @@ -399,6 +426,19 @@ def count_contacts(interval, ma, M_half, mode, agg_info, largeRegionsOperation,
return
try:
mat_to_append = ma.matrix[bin_id1 - M_half:bin_id1 + M_half + 1, :][:, bin_id2 - M_half:bin_id2 + M_half + 1].todense().astype(float)

if (orientation1 is None and orientation2 is None) or (orientation1 == '+' and orientation2 == '+'):
pass
elif orientation1 == '+' and orientation2 == '-':
# flip values concerning the y axis
mat_to_append = np.fliplr(mat_to_append)
elif orientation1 == '-' and orientation2 == '+':
# flip values concerning the x axis
mat_to_append = np.flipud(mat_to_append)
elif orientation1 == '-' and orientation2 == '-':
# flip values concerning the x and y axis
mat_to_append = mat_to_append.T

except IndexError:
log.info("index error for {} {}".format(bin_id1, bin_id2))
return
Expand Down Expand Up @@ -698,88 +738,6 @@ def plot_aggregated_contacts(clustered_info, num_clusters, M_half, args):

plt.close()

# def plot_aggregated_contacts(clustered_info, num_clusters, M_half, args):
#
# fig = plt.figure(figsize=(5.5 , 5.5 * num_clusters + 0.5))
# gs = gridspec.GridSpec(num_clusters+1, 1,
# width_ratios=[10],
# height_ratios=[10] * num_clusters + [0.6])
#
# gs.update(wspace=0.01, hspace=0.2)
# vmin, vmax = (args.vMin, args.vMax)
# cmap = cm.get_cmap(args.colorMap)
# log.debug("vmax: {}, vmin: {}".format(vmax, vmin))
# chrom_avg = []
# for cluster_number, cluster_indices in enumerate(clustered_info["clustered_dict"]):
# # compute median values
# submatrices = np.array([clustered_info["submatrices"][x] for x in cluster_indices])
# chrom_avg.append(compute_avg(submatrices, args.operationType))
# log.info("Mean aggregate matrix values: {}".format(chrom_avg[cluster_number].mean()))
# log.info("total pairs considered on cluster_{}: "
# "{}".format(cluster_number + 1, len(cluster_indices)))
#
# if chrom_avg[cluster_number].shape[0] == 0:
# log.debug("matrix for contacts on cluster_{} is empty".format(cluster_number + 1))
# continue
# title = "cluster_{}".format(cluster_number + 1)
# if args.plotType == '2d':
# ax = plt.subplot(gs[cluster_number, 0])
# ax.set_title(title)
# img = ax.imshow(chrom_avg[cluster_number], aspect='equal',
# interpolation='nearest', vmax=vmax, vmin=vmin,
# cmap=cmap,
# extent=[-M_half, M_half + 1, -M_half, M_half + 1])
#
# else:
# # Axes3D is required for projection='3d' to work
# # but since is imported but not used, flake8 will complain
# # thus I add this dummy variable to avoid the error
# Axes3D(fig)
# ax = plt.subplot(gs[cluster_number, 1], projection='3d')
# # ax.set_aspect('equal')
# ax.margins(0)
# X, Y = np.meshgrid(range(-M_half, M_half + 1),
# range(-M_half, M_half + 1))
# Z = chrom_avg[chrom][cluster_number].copy()
# img = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, linewidth=0, cmap=cmap,
# vmax=vmax, vmin=vmin, edgecolor='none')
#
# ax.set_zticklabels([])
# if vmax is not None and vmax is not None:
# ax.set_zlim(vmin, vmax)
#
# if args.outFilePrefixMatrix:
# # save aggregate matrix values
# if num_clusters == 1:
# output_matrix_name = "{file}_genome.tab".format(file=args.outFilePrefixMatrix)
# else:
# output_matrix_name = "{file}_genome_cluster_{id}.tab".format(file=args.outFilePrefixMatrix,
# id=cluster_number + 1)
# np.savetxt(output_matrix_name, chrom_avg[cluster_number], '%0.5f', delimiter='\t')
#
# if args.outFileContactPairs:
# center_values_to_order = np.array(clustered_info["centers"])[cluster_indices]
# center_values_order = np.argsort(center_values_to_order)[::-1]
#
# output_name = "{file}_cluster_{id}.tab".format(file=args.outFileContactPairs,
# id=cluster_number + 1)
# with open(output_name, 'w') as fh:
# for cl_idx in center_values_order:
# value = center_values_to_order[cl_idx]
# chrom1, start1, end1, chrom2, start2, end2 = clustered_info["coords"][cl_idx]
# fh.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(chrom1, start1, end1, chrom2, start2, end2, value))
#
#
# cbar_x = plt.subplot(gs[-1, 0])
# fig.colorbar(img, cax=cbar_x, orientation='horizontal')
#
# if args.disable_bbox_tight:
# plt.savefig(args.outFileName.name, dpi=args.dpi)
# else:
# plt.savefig(args.outFileName.name, dpi=args.dpi, bbox_inches='tight')
#
# plt.close()


def plot_diagnostic_heatmaps(clustered_info, M_half, args):
num_chromosomes = len(clustered_info.keys())
Expand Down Expand Up @@ -929,6 +887,8 @@ def main(args=None):
agg_info["counter"] = 0
agg_info["used_counter"] = 0
agg_info["empty_mat"] = 0

log.debug('agg_info["agg_matrix"] {}'.format(agg_info["agg_matrix"]))
if (args.mode == 'inter-chr') and (len(agg_info["chrom_coord"]) == 1):
exit("Error: 'inter-chr' mode can not be applied on matrices of only one chromosme.")
if (args.mode == 'inter-chr') and (args.perChr):
Expand All @@ -949,18 +909,18 @@ def main(args=None):
# agg_matrix could be either per chromosome or genome wide
aggregate_contacts_per_row(bed_intervals, bed_intervals2, agg_info, ma, chrom_list,
M_half, args.largeRegionsOperation, args.range,
args.transform, mode=args.mode, perChr=args.perChr)
args.transform, mode=args.mode, perChr=args.perChr, pConsiderStrandDirection=args.considerStrandDirection)
else: # not row-wise
# read and sort bed files.
bed_intervals = read_bed_per_chrom(args.BED, chrom_list)
bed_intervals = read_bed_per_chrom(args.BED, chrom_list, args.considerStrandDirection)
if args.BED2:
bed_intervals2 = read_bed_per_chrom(args.BED2, chrom_list)
bed_intervals2 = read_bed_per_chrom(args.BED2, chrom_list, args.considerStrandDirection)
else:
bed_intervals2 = bed_intervals
# agg_matrix could be either per chromosome or genome wide
aggregate_contacts(bed_intervals, bed_intervals2, agg_info, ma, M_half,
args.largeRegionsOperation, args.range, args.transform,
mode=args.mode)
mode=args.mode, pConsiderStrandDirection=args.considerStrandDirection)
if len(agg_info["agg_matrix"]) == 0:
exit("No susbmatrix found to be aggregated.")

Expand Down
30 changes: 27 additions & 3 deletions hicexplorer/hicAverageRegions.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ def parse_arguments(args=None):
' (Default: %(default)s).',
choices=['start', 'center', 'end'],
default='start')
parserOpt.add_argument('--considerStrandDirection',
help='This parameter specifies if the strand information is taken into account for the aggregation. '
'It has the effect that the contacts of a reverse strand region are inverted e.g. [1,2,3] becomes [3,2,1].',
action='store_true',
required=False)
parserOpt.add_argument('--version', action='version',
version='%(prog)s {}'.format(__version__))

Expand Down Expand Up @@ -130,6 +135,7 @@ def main(args=None):

hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix)
indices_values = []

with open(args.regions, 'r') as file:
for line in file.readlines():
_line = line.strip().split('\t')
Expand All @@ -141,13 +147,21 @@ def main(args=None):
viewpoint = (chrom, start, start)
elif len(_line) >= 3:
chrom, start, end = _line[0], _line[1], _line[2]
if args.considerStrandDirection and len(_line) < 6:
log.error('Strand orientation should be considered but file does not contain the 6th column of the bed file containing this information. Exiting!')
exit(1)

viewpoint = (chrom, start, end)
if args.range:
start_range_genomic, end_range_genomic, start_out, end_out = calculateViewpointRange(hic_ma, viewpoint, args.range, args.coordinatesToBinMapping)
start_bin, end_bin = getBinIndices(hic_ma, (chrom, start_range_genomic, end_range_genomic))
else:
start_bin, end_bin, start_out, end_out = calculateViewpointRangeBins(hic_ma, viewpoint, args.rangeInBins, args.coordinatesToBinMapping)
indices_values.append([start_bin, end_bin, start_out, end_out])
if args.considerStrandDirection:
indices_values.append([start_bin, end_bin, start_out, end_out, _line[5]])

else:
indices_values.append([start_bin, end_bin, start_out, end_out, None])

if args.range:
dimensions_new_matrix = (args.range[0] // hic_ma.getBinSize()) + (args.range[1] // hic_ma.getBinSize())
Expand All @@ -158,7 +172,7 @@ def main(args=None):
count_matrix = np.zeros(shape=(dimensions_new_matrix, dimensions_new_matrix))

# max_length = hic_ma.matrix.shape[1]
for start, end, start_out, end_out in indices_values:
for start, end, start_out, end_out, orientation in indices_values:
_start = 0
_end = summed_matrix.shape[1]
# if start < 0:
Expand All @@ -172,8 +186,18 @@ def main(args=None):
_start = _end - orig_matrix_length
if end_out:
_end = start + orig_matrix_length
submatrix = hic_ma.matrix[start:end, start:end]
if summed_matrix.shape != submatrix.shape:
log.warning('Shape of a submatrix does not match. It is ignored.')
log.warning('Region: {}'.format(hic_ma.getBinPos(start)))
continue
count_matrix[_start:_end, _start:_end] += 1
summed_matrix[_start:_end, _start:_end] += hic_ma.matrix[start:end, start:end]

if orientation is None or orientation == '+':
summed_matrix[_start:_end, _start:_end] += hic_ma.matrix[start:end, start:end]
elif orientation == '-':

summed_matrix[_start:_end, _start:_end] += hic_ma.matrix[start:end, start:end].T
summed_matrix /= count_matrix
summed_matrix = np.array(summed_matrix)
data = summed_matrix[np.nonzero(summed_matrix)]
Expand Down

0 comments on commit c7135ff

Please sign in to comment.