Skip to content

Commit d5adee0

Browse files
committed
demux_just_fwd
1 parent 05036b7 commit d5adee0

File tree

5 files changed

+83
-3
lines changed

5 files changed

+83
-3
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,4 +59,5 @@ dependencies = [
5959
configure_klp = "qp_klp.scripts.configure_klp:config"
6060
start_klp = "qp_klp.scripts.start_klp:execute"
6161
demux = "sequence_processing_pipeline.scripts.cli:demux"
62+
demux_single = "sequence_processing_pipeline.scripts.cli:demux_single"
6263
pacbio_generate_bam2fastq_commands = "qp_klp.scripts.pacbio_commands:generate_bam2fastq_commands"

src/qp_klp/PacBioMetagenomicWorkflow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from .Assays import ASSAY_NAME_METAGENOMIC
55
from .FailedSamplesRecord import FailedSamplesRecord
66
from .Workflows import Workflow
7+
import pandas as pd
78

89

910
class PacBioMetagenomicWorkflow(Workflow, Metagenomic, PacBio):
@@ -40,7 +41,6 @@ def __init__(self, **kwargs):
4041
self.fsr = FailedSamplesRecord(self.kwargs['output_dir'],
4142
self.pipeline.sample_sheet.samples)
4243

43-
import pandas as pd
4444
samples = [
4545
{'barcode': sample['Sample_ID'],
4646
'sample_name': sample['Sample_Name'],

src/sequence_processing_pipeline/Commands.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,3 +192,82 @@ def demux(id_map, fp, out_d, task, maxtask):
192192
for d in openfps.values():
193193
for f in d.values():
194194
f.close()
195+
196+
197+
def demux_just_fwd(id_map, fp, out_d, task, maxtask):
    """Split forward-only multiplexed FASTQ data based on the provided map.

    Every record in ``fp`` is expected to carry a sequence id line of the
    form ``@<idx>::MUX::<original id>``. Records whose ``@<idx>`` prefix
    belongs to this task are written, with the prefix removed, to
    ``<out_d>/<outbase>/<r1>.fastq.gz``; all other records are skipped.

    Parameters
    ----------
    id_map : iterable of (str, str, str)
        ``(idx, r1, outbase)`` triples: the multiplex index (without the
        leading ``@``), the base name of the forward output file, and the
        sub-directory of ``out_d`` to write it into.
    fp : iterable of str
        Lines of the multiplexed FASTQ data, four lines per record. Any
        iterable of lines works (open file handle, list, generator).
    out_d : str
        Root output directory.
    task : int
        Index of this task; only ``id_map`` entries whose position
        satisfies ``position % maxtask == task`` are processed.
    maxtask : int
        Total number of tasks the ``id_map`` is divided across.

    Raises
    ------
    ValueError
        If a sequence id line lacks the ``::MUX::`` delimiter, or the
        original id is not of the form ``<id>`` or ``<id> <metadata>``.
    KeyError
        If a record selected for this task has an orientation other than
        ``'1'``; only a forward output file is opened per sample.
    """
    delimiter = '::MUX::'
    mode = 'wt'
    ext = '.fastq.gz'
    sep = '/'
    rec = '@'

    openfps = {}

    # everything that can raise after the first file is opened lives inside
    # the try so that the output handles are always closed (and the gzip
    # trailers flushed) on the way out.
    try:
        for offset, (idx, r1, outbase) in enumerate(id_map):
            if offset % maxtask != task:
                continue

            # setup output locations
            outdir = out_d + sep + outbase
            fullname_r1 = outdir + sep + r1 + ext

            # we have seen in lustre that sometimes this line
            # can hit a race condition; making sure it doesn't break
            # things
            try:
                os.makedirs(outdir, exist_ok=True)
            except FileExistsError:
                pass

            openfps[rec + idx] = {'1': gzip.open(fullname_r1, mode)}

        # setup a parser: one shared iterator handed to zip four times
        # yields consecutive groups of four lines for any iterable of lines,
        # not only for objects (like file handles) that are their own
        # iterator.
        lines = iter(fp)

        for i, s, plus, q in zip(lines, lines, lines, lines):
            # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1'
            # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1 BX:Z:TATGACACATGCGGCCCT' # noqa
            # '@baz/1

            # NB: from 6d794a37-12cd-4f8e-95d6-72a4b8a1ec1c's only-adapter-filtered results: # noqa
            # @A00953:244:HYHYWDSXY:3:1101:14082:3740 1:N:0:CCGTAAGA+TCTAACGC

            fname_encoded, found, sid = i.partition(delimiter)
            if not found:
                raise ValueError(f"'{i.strip()}' does not contain the "
                                 f"'{delimiter}' delimiter")

            if fname_encoded not in openfps:
                continue

            current_fp = openfps[fname_encoded]

            # remove '\n' from sid and split on all whitespace.
            tmp = sid.strip().split()

            if len(tmp) == 1:
                # sequence id line contains no optional metadata.
                # don't change sid. The orientation is taken from the
                # stripped id so it does not depend on the exact line
                # terminator.
                orientation = tmp[0][-1]
                sid = rec + sid
            elif len(tmp) == 2:
                sid = tmp[0]
                metadata = tmp[1]
                # no '\n'
                orientation = sid[-1]
                # hexdump confirms separator is ' ', not '\t'
                sid = rec + sid + ' ' + metadata + '\n'
            else:
                raise ValueError(f"'{sid}' is not a recognized form")

            try:
                out = current_fp[orientation]
            except KeyError:
                raise KeyError(f"unsupported read orientation "
                               f"'{orientation}' in '{sid.strip()}'; only "
                               f"forward reads are handled") from None

            out.write(sid)
            out.write(s)
            out.write(plus)
            out.write(q)
    finally:
        for handles in openfps.values():
            for f in handles.values():
                f.close()

src/sequence_processing_pipeline/templates/nuqc_job_single.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ function demux-runner () {
149149

150150
for idx in $(seq 0 ${n_demux_jobs})
151151
do
152-
python {{demux_path}} \
152+
demux_just_fwd \
153153
--id-map ${id_map} \
154154
--infile <(cat ${seqs_r1}) \
155155
--output ${OUTPUT} \

tests/data/configuration_profiles/pacbio_metagenomic.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
"queue": "qiita",
5252
"nthreads": 16,
5353
"wallclock_time_in_minutes": 60,
54-
"modules_to_load": ["fastqc_0.11.5"],
54+
"modules_to_load": ["fastqc_0.12.1"],
5555
"fastqc_executable_path": "fastqc",
5656
"multiqc_executable_path": "multiqc",
5757
"multiqc_config_file_path": "sequence_processing_pipeline/multiqc-bclconvert-config.yaml",

0 commit comments

Comments
 (0)