forked from TheJacksonLaboratory/diachrscripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpooler.py
executable file
·101 lines (76 loc) · 3.69 KB
/
pooler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python
"""
This script pools the four read pair counts of interactions from different files found at a given path.
The usage of pooler is demonstrated in the following Jupyter notebook:
diachrscripts/jupyter_notebooks/usage/usage_of_pooler.ipynb
"""
import argparse
import os
from diachr.diachromatic_interaction_set import DiachromaticInteractionSet
# Parse command line
####################

parser = argparse.ArgumentParser(description='Combine interactions that occur in a specified number of replicates.')
parser.add_argument('-o', '--out-prefix', help='Prefix for output.', default='OUT_PREFIX')
# The help text states the actual selection rule: only names ending in ".tsv.gz" are collected further below
parser.add_argument('-i', '--interaction-files-path',
                    help='Path to directory with Diachromatic interaction files. '
                         'Only files ending in ".tsv.gz" are read.',
                    required=True)
# Let argparse convert the value, so that a non-numeric argument results in a usage error and not in a traceback
parser.add_argument('-r', '--required-replicates', help='Required number of replicates.', type=int, required=False,
                    default=2)

args = parser.parse_args()
out_prefix = args.out_prefix
interaction_files_path = args.interaction_files_path
required_replicates = args.required_replicates

# Report on the chosen parameters (printed here and written to the summary file at the end of the script)
parameter_info = "[INFO] " + "Input parameters" + '\n'
parameter_info += "\t[INFO] --out-prefix: " + out_prefix + '\n'
parameter_info += "\t[INFO] --interaction-files-path: " + interaction_files_path + '\n'
parameter_info += "\t[INFO] --required-replicates: " + str(required_replicates) + '\n'
print(parameter_info)
# Get list of interaction files under given path
################################################
def get_gzip_tsv_files(path):
    """
    Get a sorted list of all gzipped TSV files in a directory.

    Only directory entries whose name ends with ".tsv.gz" are selected. The directory is not searched recursively.
    The names are sorted because os.listdir() returns entries in arbitrary order, which would otherwise make the
    order in which files are processed and reported depend on the file system.

    :param path: Directory to be scanned
    :return: List of paths (given path joined with the entry name), sorted by entry name
    """
    return [os.path.join(path, name) for name in sorted(os.listdir(path)) if name.endswith(".tsv.gz")]
gz_files = get_gzip_tsv_files(interaction_files_path)

# Abort before any file is parsed if there are fewer interaction files than required replicates
if len(gz_files) < required_replicates:
    print("[FATAL] Not enough replicates. Must be at least " + str(required_replicates) + " But there are only " + str(
        len(gz_files)) + " files.")
    # exit() is a helper added by the site module for interactive use; SystemExit is always available
    raise SystemExit(1)
# Perform analysis
##################

# Read every interaction file into one and the same interaction set
interaction_set = DiachromaticInteractionSet()
for interaction_file in gz_files:
    interaction_set.parse_file(i_file=interaction_file, verbose=True)

# Keep the reports on reading for the summary file
read_file_info_report = interaction_set.get_read_file_info_report()
read_file_info_table_row = interaction_set.get_read_file_info_table_row()
print()

# Write interactions that occur in the required number of replicates to file
f_name_interactions = "{}_at_least_{}_combined_interactions.tsv.gz".format(out_prefix, required_replicates)
interaction_set.write_diachromatic_interaction_file(
    target_file=f_name_interactions,
    required_replicates=required_replicates,
    verbose=True)

# Keep the reports on writing for the summary file
write_file_info_report = interaction_set.get_write_file_info_report()
write_file_info_table_row = interaction_set.get_write_file_info_table_row()
# Create file with summary statistics
#####################################

f_name_summary = out_prefix + "_at_least_" + str(required_replicates) + "_combined_summary.txt"

# Report on generated files
generated_file_info = "[INFO] Generated files:" + '\n'
generated_file_info += "\t[INFO] " + f_name_summary + '\n'
generated_file_info += "\t[INFO] " + f_name_interactions + '\n'

# The context manager closes the file even if one of the writes raises an exception
with open(f_name_summary, 'wt') as out_fh:

    # Chosen parameters
    out_fh.write(parameter_info + '\n')

    # Report on reading files
    out_fh.write(read_file_info_report + '\n')
    out_fh.write(read_file_info_table_row + '\n')

    # Report on writing the file
    out_fh.write(write_file_info_report + '\n')
    out_fh.write(write_file_info_table_row + '\n')

    # Report on generated files
    out_fh.write(generated_file_info)