Skip to content

Commit

Permalink
Implementing for hicBuildMatrix the restriction cut file limitations …
Browse files Browse the repository at this point in the history
…to the region which is defined. #646
  • Loading branch information
joachimwolff committed Mar 3, 2021
1 parent f28567a commit 5685162
Show file tree
Hide file tree
Showing 15 changed files with 320 additions and 3 deletions.
15 changes: 12 additions & 3 deletions hicexplorer/hicBuildMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ def get_bins(bin_size, chrom_size, region=None):
return bin_intvals


def bed2interval_list(bed_file_handler):
def bed2interval_list(bed_file_handler, pChromosomeSize, pRegion):
r"""
reads a BED file and returns
a list of tuples containing
Expand All @@ -387,17 +387,26 @@ def bed2interval_list(bed_file_handler):
[('chr1', 10, 20), ('chr1', 60, 70)]
>>> os.remove(_file.name)
"""

if pRegion is not None:
chrom_size, start_region, end_region, _ = \
getUserRegion(pChromosomeSize, pRegion)

count = 0
interval_list = []
for line in bed_file_handler:
count += 1
fields = line.strip().split()
try:
chrom, start, end = fields[0], int(fields[1]), int(fields[2])

except IndexError:
log.error("error reading BED file at line {}".format(count))

interval_list.append((chrom, start, end))
if chrom == chrom_size[0] and start_region <= start and end_region <= end:

interval_list.append((chrom, start, end))

return interval_list


Expand Down Expand Up @@ -1153,7 +1162,7 @@ def main(args=None):

rf_interval = []
for restrictionCutFile in args.restrictionCutFile:
rf_interval.extend(bed2interval_list(restrictionCutFile))
rf_interval.extend(bed2interval_list(restrictionCutFile, chrom_sizes, args.region))

rf_positions = intervalListToIntervalTree(rf_interval)
log.debug('rf_positions {}'.format(rf_positions.keys()))
Expand Down
32 changes: 32 additions & 0 deletions hicexplorer/test/general/test_hicBuildMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,38 @@ def test_build_matrix_restriction_enzyme(capsys):
# os.unlink("/tmp/test.bam")


def test_build_matrix_restriction_enzyme_region(capsys):
outfile = NamedTemporaryFile(suffix='.h5', delete=False)
outfile.close()
outfile_bam = NamedTemporaryFile(suffix='.bam', delete=False)
outfile.close()
qc_folder = mkdtemp(prefix="testQC_")
args = "-s {} {} --outFileName {} -bs 5000 -b {} --QCfolder {} --threads 4 --danglingSequence GATC AGCT --restrictionSequence GATC AAGCTT -rs {} {} --region {}".format(sam_R1, sam_R2,
outfile.name, outfile_bam.name,
qc_folder, dpnii_file, ROOT + 'hicFindRestSite/hindIII.bed', 'chr3R').split()
# hicBuildMatrix.main(args)
compute(hicBuildMatrix.main, args, 5)

test = hm.hiCMatrix(ROOT + "small_test_matrix_parallel_two_rc_chr3R.h5")
new = hm.hiCMatrix(outfile.name)

# print('test.cut_intervals {}'.format(test.cut_intervals))
# print('new.cut_intervals {}'.format(new.cut_intervals))
nt.assert_equal(test.matrix.data, new.matrix.data)
nt.assert_equal(len(test.cut_intervals), len(new.cut_intervals))

# print("MATRIX NAME:", outfile.name)
print(set(os.listdir(ROOT + "QC_region/")))
assert are_files_equal(ROOT + "QC_region/QC.log", qc_folder + "/QC.log")
assert set(os.listdir(ROOT + "QC_region/")) == set(os.listdir(qc_folder))

# accept delta of 60 kb, file size is around 4.5 MB
assert abs(os.path.getsize(ROOT + "build_region.bam") - os.path.getsize(outfile_bam.name)) < 64000

os.unlink(outfile.name)
shutil.rmtree(qc_folder)


def test_build_matrix_chromosome_sizes(capsys):
outfile = NamedTemporaryFile(suffix='.h5', delete=False)
outfile.close()
Expand Down
30 changes: 30 additions & 0 deletions hicexplorer/test/test_data/QC_region/QC.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@

File /tmp/tmpmzka3uhk.h5
Sequenced reads 99983
Min rest. site distance 300
Max library insert size 1000

# count (percentage w.r.t. total sequenced reads)
Pairs mappable, unique and high quality 52726 (52.73)
Hi-C contacts 7987 (7.99)
One mate unmapped 8777 (8.78)
One mate not unique 3603 (3.60)
Low mapping quality 34877 (34.88)

# count (percentage w.r.t. mappable, unique and high quality pairs)
dangling end GATC 11 (0.02)
dangling end AGCT 59 (0.11)
self ligation (removed) 0 (0.00)
One mate not close to rest site 40906 (77.58)
same fragment 3751 (7.11)
self circle 0 (0.00)
duplicated pairs 12 (0.02)

# count (percentage w.r.t. total valid pairs used)
inter chromosomal 0 (0.00)
Intra short range (< 20kb) 2145 (26.86)
Intra long range (>= 20kb) 5842 (73.14)
Read pair type: inward pairs 1806 (22.61)
Read pair type: outward pairs 2447 (30.64)
Read pair type: left pairs 1865 (23.35)
Read pair type: right pairs 1869 (23.40)
2 changes: 2 additions & 0 deletions hicexplorer/test/test_data/QC_region/QC_table.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
File Sequenced reads Min rest. site distance Max library insert size Pairs mappable, unique and high quality Hi-C contacts One mate unmapped One mate not unique Low mapping quality dangling end GATC dangling end AGCT self ligation (removed) One mate not close to rest site same fragment self circle duplicated pairs inter chromosomal Intra short range (< 20kb) Intra long range (>= 20kb) Read pair type: inward pairs Read pair type: outward pairs Read pair type: left pairs Read pair type: right pairs
/tmp/tmpmzka3uhk.h5 99983 300 1000 52726 7987 8777 3603 34877 11 59 0 40906 3751 0 12 0 2145 5842 1806 2447 1865 1869
2 changes: 2 additions & 0 deletions hicexplorer/test/test_data/QC_region/discarded_table.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
File One mate not close to rest site One mate not close to rest site % dangling end GATC % dangling end AGCT % duplicated pairs duplicated pairs % same fragment same fragment % self circle self circle % self ligation (removed) self ligation (removed) %
/tmp/tmpmzka3uhk.h5 40906 0.7758221750180176 0.0002086257254485453 0.001118992527405834 12 0.00022759170048932214 3751 0.07114137237795395 0 0.0 0 0.0
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 2 additions & 0 deletions hicexplorer/test/test_data/QC_region/distance_table.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
File inter chromosomal inter chromosomal % Intra short range (< 20kb) Intra short range (< 20kb) % Intra long range (>= 20kb) Intra long range (>= 20kb) %
/tmp/tmpmzka3uhk.h5 0 0.0 2145 0.2685614122949793 5842 0.7314385877050207
Loading

0 comments on commit 5685162

Please sign in to comment.