Skip to content

Commit 3ff0d68

Browse files
committed
Handle subgraphs and subgraphs output to fasta
1 parent 1bf56fc commit 3ff0d68

File tree

20 files changed

+282
-52
lines changed

20 files changed

+282
-52
lines changed

.coveragerc

+5-1
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,16 @@ omit =
66
cortexpy/test/*
77

88
[report]
9+
skip_covered = True
910
exclude_lines =
1011
no cov
1112
no qa
1213
noqa
1314
pragma: no cover
14-
if __name__ == .__main__.:
15+
16+
# Don't complain if tests don't hit defensive assertion code:
17+
raise AssertionError
18+
raise NotImplementedError
1519

1620
[html]
1721
directory = coverage_html_report

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,4 @@ ENV/
104104
.idea/
105105

106106
scratch/
107+
coverage_html_report/

Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ UNIT_TEST_COMMAND = $(RUN_IN_ENV) pytest \
99
--flake8 \
1010
--cov=cortexpy \
1111
--cov-report term-missing \
12+
--cov-report html \
1213
--hypothesis-profile $(HYPOTHESIS_PROFILE)
1314
TEST_COMMAND = BIN_DIR=$(BIN_DIR) $(UNIT_TEST_COMMAND)
1415
BENCHMARK_DIR := cortex_tools_benchmark

cortexpy/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
__version__ = '0.10.0'
1+
__version__ = '0.11.0'
22
VERSION_STRING = 'cortexpy version {}'.format(__version__)

cortexpy/command/prune.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@
1313
graph A cortexpy graph. '-' redirects to or from stdout.
1414
"""
1515
from cortexpy.graph import Interactor
16+
import logging
17+
18+
from cortexpy.utils import get_graph_stream_iterator
19+
20+
logger = logging.getLogger(__name__)
1621

1722

1823
def validate_prune(args):
@@ -41,10 +46,10 @@ def prune(argv):
4146
output = open(args['--out'], 'wb')
4247

4348
if args['<graph>'] == '-':
44-
graph = nx.read_gpickle(sys.stdin.buffer)
49+
graphs = get_graph_stream_iterator(sys.stdin.buffer)
4550
else:
46-
graph = nx.read_gpickle(args['<graph>'])
47-
48-
Interactor(graph, colors=None).prune_tips_less_than(args['--remove-tips'])
51+
graphs = get_graph_stream_iterator(open(args['<graph>'], 'rb'))
4952

50-
nx.write_gpickle(graph, output)
53+
for graph in graphs:
54+
Interactor(graph, colors=None).prune_tips_less_than(args['--remove-tips'])
55+
nx.write_gpickle(graph, output)

cortexpy/command/traverse.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
The traverser will follow all colors specified.
1414
--max-nodes <n> Maximum number of nodes to traverse [default: 1000].
1515
--initial-fasta Treat <initial-contigs> as fasta
16+
--subgraphs Emit traversal as sequence of networkx subgraphs
1617
1718
Description:
1819
Traverse a cortex graph starting from each k-mer in an initial_contig and return the subgraph as
@@ -63,4 +64,9 @@ def traverse(argv):
6364
else:
6465
engine.traverse_from_each_kmer_in(args['<initial-contigs>'])
6566

66-
nx.write_gpickle(engine.graph, output)
67+
output_graph = engine.graph
68+
if args['--subgraphs'] and len(output_graph) > 0:
69+
for subgraph in nx.weakly_connected_component_subgraphs(output_graph):
70+
nx.write_gpickle(subgraph, output)
71+
else:
72+
nx.write_gpickle(output_graph, output)

cortexpy/command/view.py

+19-12
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,12 @@
1818
<traversal> is '-'.
1919
2020
"""
21+
import networkx as nx
2122
from docopt import docopt
2223
import logging
2324

25+
from cortexpy.utils import get_graph_stream_iterator
26+
2427
logger = logging.getLogger('cortexpy.view')
2528

2629

@@ -61,24 +64,22 @@ def view_traversal(args):
6164

6265
args = validate_view_traversal(args)
6366

64-
import networkx as nx
65-
6667
if args['<traversal>'] == '-':
67-
graph = nx.read_gpickle(sys.stdin.buffer)
68+
graphs = get_graph_stream_iterator(sys.stdin.buffer)
6869
else:
69-
graph = nx.read_gpickle(args['<traversal>'])
70+
graphs = get_graph_stream_iterator(open(args['<traversal>'], 'rb'))
7071

7172
if args['--to-json']:
73+
graph = nx.compose_all(list(graphs))
7274
print(serializer.Serializer(graph).to_json())
7375
else:
74-
kmer_serializer = serializer.Kmers(graph)
75-
if args['--kmers']:
76-
seq_record_generator = kmer_serializer.to_seq_records()
77-
else:
78-
seq_record_generator = interactor.Contigs(
79-
graph, args['--color']
80-
).all_simple_paths()
81-
SeqIO.write(seq_record_generator, sys.stdout, 'fasta')
76+
for graph_idx, graph in enumerate(graphs):
77+
if args['--kmers']:
78+
seq_record_generator = serializer.Kmers(graph).to_seq_records()
79+
else:
80+
seq_record_generator = interactor.Contigs(graph, args['--color']).all_simple_paths()
81+
seq_record_generator = annotated_seq_records(seq_record_generator, graph_idx=graph_idx)
82+
SeqIO.write(seq_record_generator, sys.stdout, 'fasta')
8283

8384

8485
def view_graph(args):
@@ -120,3 +121,9 @@ def kmer_to_cortex_jdk_print_string(kmer, alt_kmer_string=None):
120121
to_print.append(' ' + ' '.join(map(str, kmer.coverage)))
121122
to_print.append(' ' + ' '.join(edge_set_strings))
122123
return ''.join(to_print)
124+
125+
126+
def annotated_seq_records(seq_record_generator, *, graph_idx):
127+
for rec in seq_record_generator:
128+
rec.id = 'g{}_p{}'.format(graph_idx, rec.id)
129+
yield rec

cortexpy/graph/interactor.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def prune_tips_less_than(self, n):
3535
tip = find_tip_from(source, n=n, graph=graph, next_node_func=next_node_func)
3636
if tip:
3737
nodes_to_prune |= tip
38+
logger.info('Pruning {} nodes'.format(len(nodes_to_prune)))
3839
self.graph.remove_nodes_from(nodes_to_prune)
3940
return self
4041

@@ -68,7 +69,6 @@ def make_copy_of_color(graph, color, include_self_refs=False):
6869
def convert_kmer_path_to_contig(path):
6970
if len(path) == 0:
7071
return ''
71-
logger.info(path)
7272
contig = [path[0]]
7373
for kmer in path[1:]:
7474
contig.append(kmer[-1])
@@ -95,12 +95,11 @@ class Contigs(object):
9595
color = attr.ib(None)
9696

9797
def all_simple_paths(self):
98-
graph = self.graph
9998
if self.color is not None:
10099
graph = make_copy_of_color(self.graph, self.color, include_self_refs=False)
100+
else:
101+
graph = self.graph
101102
idx = 0
102-
logger.info(graph.nodes())
103-
logger.info(graph.edges())
104103
for source in in_nodes_of(graph):
105104
for target in out_nodes_of(graph):
106105
if source == target:

cortexpy/graph/serializer.py

-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ def to_json(self):
4141
return json.dumps(serializable)
4242

4343
def _collapse_graph_to_unitigs(self):
44-
# self.graph = annotate_kmer_graph_edges(self.graph, self.colors)
4544
self._collapse_kmer_graph()
4645
self._make_unitig_graph_json_representable()
4746

cortexpy/test/builder/graph/body.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,5 @@ def edge_set_string_to_array(edge_set_string):
120120
return np.array(edge_set, dtype=np.uint8)
121121

122122

123-
def print_kmer(kmer):
124-
print(kmer)
125-
print(kmer.kmer)
126-
print(kmer.coverage)
127-
print(kmer.edges)
123+
# def print_kmer(kmer):
124+
# print(kmer)

cortexpy/test/builder/mccortex.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,10 @@ def build(self, tmpdir):
3737
output_graph = str(tmpdir.join('output.ctx'))
3838
mccortex_args.append(output_graph)
3939

40-
runner.Mccortex(self.kmer_size).run(mccortex_args)
40+
ret = runner.Mccortex(self.kmer_size).run(mccortex_args)
41+
logger.debug('\n' + ret.stderr.decode())
4142

42-
runner.Mccortex(self.kmer_size).view(output_graph)
43+
ret = runner.Mccortex(self.kmer_size).view(output_graph)
44+
logger.debug('\n' + ret.stdout.decode())
4345

4446
return output_graph

cortexpy/test/driver/command.py

+101-6
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
from pathlib import Path
22

3+
import Bio
34
import attr
45
import os
6+
7+
import networkx as nx
58
from Bio import SeqIO
69
from Bio.Seq import Seq
710
from Bio.SeqRecord import SeqRecord
@@ -58,13 +61,13 @@ class Prune(object):
5861
tmpdir = attr.ib()
5962
mccortex_builder = attr.ib(attr.Factory(builder.Mccortex))
6063
min_tip_length = attr.ib(None)
61-
last_record = attr.ib(None)
64+
records = attr.ib(attr.Factory(list))
6265
kmer_size = attr.ib(None)
6366

6467
def with_records(self, *records):
6568
for rec in records:
6669
self.mccortex_builder.with_dna_sequence(rec)
67-
self.last_record = rec
70+
self.records.append(SeqRecord(Seq(rec)))
6871
return self
6972

7073
def prune_tips_less_than(self, n):
@@ -77,18 +80,110 @@ def with_kmer_size(self, size):
7780
return self
7881

7982
def run(self):
80-
import networkx as nx
8183
mccortex_graph = self.mccortex_builder.build(self.tmpdir)
8284

8385
cortexpy_graph = self.tmpdir / 'cortexpy_graph.pickle'
84-
initial_contig = self.last_record[0:self.kmer_size]
86+
contig_fasta = self.tmpdir / 'initial_contigs.fa'
87+
with open(str(contig_fasta), 'w') as fh:
88+
SeqIO.write(self.records, fh, 'fasta')
8589
ctp_runner = runner.Cortexpy(SPAWN_PROCESS)
86-
ctp_runner.traverse(graph=mccortex_graph, out=cortexpy_graph, contig=initial_contig)
90+
ctp_runner.traverse(graph=mccortex_graph, out=cortexpy_graph, contig=contig_fasta,
91+
contig_fasta=True, subgraphs=True)
8792

8893
pruned_graph = Path(cortexpy_graph).with_suffix('.pruned.pickle')
8994
completed_process = ctp_runner.prune(graph=cortexpy_graph, out=pruned_graph,
9095
remove_tips=self.min_tip_length)
9196

9297
assert completed_process.returncode == 0, completed_process
9398

94-
return expectation.KmerGraphExpectation(nx.read_gpickle(str(pruned_graph)))
99+
subgraphs = list(load_graph_stream(str(pruned_graph)))
100+
return expectation.graph.KmerGraphsExpectation(subgraphs)
101+
102+
103+
@attr.s(slots=True)
104+
class Traverse(object):
105+
"""Runner for traverse acceptance tests"""
106+
tmpdir = attr.ib()
107+
mccortex_builder = attr.ib(attr.Factory(builder.Mccortex))
108+
traversal_contigs = attr.ib(None)
109+
added_records = attr.ib(attr.Factory(list))
110+
output_subgraphs = attr.ib(False)
111+
cortexpy_graph = attr.ib(init=False)
112+
113+
def with_records(self, *records):
114+
for rec in records:
115+
self.mccortex_builder.with_dna_sequence(rec)
116+
self.added_records.append(rec)
117+
return self
118+
119+
def with_initial_contigs(self, *contigs):
120+
self.traversal_contigs = contigs
121+
122+
def with_subgraph_output(self):
123+
self.output_subgraphs = True
124+
return self
125+
126+
def with_kmer_size(self, size):
127+
self.mccortex_builder.with_kmer_size(size)
128+
return self
129+
130+
def run(self):
131+
mccortex_graph = self.mccortex_builder.build(self.tmpdir)
132+
contig_fasta = self.tmpdir / 'cortexpy_initial_contigs.fa'
133+
if self.traversal_contigs:
134+
initial_contigs = self.traversal_contigs
135+
else:
136+
initial_contigs = self.added_records
137+
with open(str(contig_fasta), 'w') as fh:
138+
Bio.SeqIO.write([SeqRecord(Seq(s)) for s in initial_contigs], fh, 'fasta')
139+
140+
self.cortexpy_graph = self.tmpdir / 'cortexpy_graph.pickle'
141+
ctp_runner = runner.Cortexpy(SPAWN_PROCESS)
142+
completed_process = ctp_runner.traverse(graph=mccortex_graph,
143+
out=self.cortexpy_graph,
144+
contig=contig_fasta,
145+
contig_fasta=True,
146+
subgraphs=self.output_subgraphs)
147+
148+
subgraphs = list(load_graph_stream(self.cortexpy_graph))
149+
assert completed_process.returncode == 0, completed_process
150+
151+
return expectation.graph.KmerGraphsExpectation(subgraphs)
152+
153+
154+
@attr.s(slots=True)
155+
class ViewTraversal(object):
156+
"""Runner for view of traversal acceptance tests"""
157+
tmpdir = attr.ib()
158+
traverse_driver = attr.ib(init=False)
159+
160+
def __attrs_post_init__(self):
161+
self.traverse_driver = Traverse(self.tmpdir)
162+
163+
def with_records(self, *records):
164+
self.traverse_driver.with_records(*records)
165+
return self
166+
167+
def with_subgraph_output(self):
168+
self.traverse_driver.with_subgraph_output()
169+
return self
170+
171+
def with_kmer_size(self, size):
172+
self.traverse_driver.with_kmer_size(size)
173+
return self
174+
175+
def run(self):
176+
self.traverse_driver.run()
177+
ret = runner.Cortexpy(SPAWN_PROCESS).view(
178+
cortexpy_graph=self.traverse_driver.cortexpy_graph)
179+
assert ret.returncode == 0, ret
180+
return expectation.Fasta(ret.stdout)
181+
182+
183+
def load_graph_stream(path):
184+
with open(str(path), 'rb') as fh:
185+
while True:
186+
try:
187+
yield nx.read_gpickle(fh)
188+
except EOFError:
189+
break

cortexpy/test/expectation/fasta.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import io
2-
3-
import sys
42
from Bio import SeqIO
53
import attr
4+
import logging
5+
6+
logger = logging.getLogger(__name__)
67

78

89
@attr.s(slots=True)
@@ -21,12 +22,16 @@ class Fasta(object):
2122
fasta_records = attr.ib(init=False)
2223

2324
def __attrs_post_init__(self):
24-
print(self.fasta_string, file=sys.stderr)
25+
logger.info(self.fasta_string)
2526
self.fasta_records = [rec for rec in SeqIO.parse(io.StringIO(self.fasta_string), "fasta")]
26-
print(self.fasta_records, file=sys.stderr)
27+
logger.info(self.fasta_records)
2728

2829
def has_n_records(self, n):
29-
assert len(self.fasta_records) == n
30+
assert n == len(self.fasta_records)
31+
return self
32+
33+
def has_record_ids(self, *ids):
34+
assert set(ids) == {rec.id for rec in self.fasta_records}
3035
return self
3136

3237
def has_record(self, expected_record_seq):

0 commit comments

Comments
 (0)