Skip to content

Commit 8feed1a

Browse files
committed
Add option to disable reference TE detection, add dry run option, implement tests for input files
1 parent b8a0896 commit 8feed1a

File tree

5 files changed

+509
-17
lines changed

5 files changed

+509
-17
lines changed

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,19 +69,33 @@ python TEforest.py \
6969
--ref_model <path/to/reference_model.pkl> \
7070
--fq_base_path <path/to/fastq/files> \
7171
--cleanup_intermediates \
72+
--disable_reference_detection \
73+
--dry_run \
7274
--samples A1 A2 A3
7375
```
7476

7577
- **`--workflow_dir`**: Directory containing the `Snakefile` (`workflow/Snakefile`).
7678
- **`--workdir`**: Directory to store outputs and logs.
7779
- **`--threads`**: Number of CPU threads to use. 16 per sample is recommended.
7880
- **`--consensusTEs`, `--ref_genome`, `--ref_te_locations`, `--euchromatin`**: Input reference files for TE detection. All calls outside of the regions denoted in euchromatin will be filtered. Example files used for Drosophila melanogaster are located in example_files/. Be aware that BWA-mem2 will treat IUPAC bases as missing, so TEforest may have reduced performance on consensus sequences with high IUPAC content.
81+
- Current reference BED usage in inference: columns 1/2/3 are used as genomic coordinates, and column 7 is used as the TE family ID.
82+
- Columns 4/5/6 and any trailing columns are accepted but are not used by the pipeline.
83+
- BED can be tab-delimited or whitespace-delimited.
7984
- **`--model`**: Path to the non-reference model (optional). If omitted, TEforest auto-selects a model based on the observed coverage (5X/10X/20X/30X/40X/50X). If the data are not downsampled (e.g., 48X), the next highest model is chosen (50X).
8085
- **`--ref_model`**: Path to the reference model (optional). Auto-selection follows the same coverage logic as above.
8186
- **`--fq_base_path`**: Directory containing FASTQ files. TEforest will match common read naming conventions (e.g., `_R1/_R2`, `_1/_2`, `.1/.2`, `R1/R2`, lane tokens like `_L001_R1_001`) as long as the sample name appears in the filename.
8287
- **`--cleanup_intermediates`**: Optional flag to delete large intermediate files after they are used (e.g., `fastp/`, `aligned/`, `downsampled/`, `candidate_regions_data/`). Omit this if you want to keep read alignments or candidate-region BAMs for debugging.
88+
- **`--disable_reference_detection`**: Optional flag to skip reference TE feature-vector creation and reference model prediction. This can be useful for genomes with very large numbers of old reference TEs, where reference detection can dominate runtime.
89+
- **`--dry_run`**: Optional flag to run `snakemake --dry-run` through the wrapper, so you can validate file naming, inputs, and DAG construction before launching compute-heavy jobs.
8390
- **`--samples`**: List of sample identifiers to process (space-separated). Note that more than one sample can be run in parallel.
8491

92+
Input validation (runs for both normal execution and `--dry_run`):
93+
- FASTQs are resolved per sample/read, must exist, be non-empty, and have a valid first FASTQ record.
94+
- Reference BED must have at least 7 whitespace-delimited columns.
95+
- Reference BED chromosome names (column 1) must match sequence headers in `ref_genome`.
96+
- Every TE ID in reference BED column 7 must be present in `consensusTEs` FASTA headers.
97+
- Extra TE families in the consensus FASTA are allowed.
98+
8599
The script will generate:
86100
- A `config.yaml` in your specified `workdir` with all parameters.
87101
- Intermediate files used to run the pipeline

TEforest.py

Lines changed: 269 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,239 @@
11
#!/usr/bin/env python3
22

33
import argparse
4+
import gzip
45
import os
6+
import re
57
import sys
68
import subprocess
79
import textwrap
810
import yaml # Ensure PyYAML is installed: pip install pyyaml
911
import logging
1012

13+
# Recognized FASTQ filename extensions; compressed variants listed first so
# extension stripping removes the full ".fastq.gz"/".fq.gz" suffix.
FASTQ_EXTS = (".fastq.gz", ".fq.gz", ".fastq", ".fq")
14+
15+
16+
def normalize_te_id(raw_id):
    """Return a canonical TE family ID.

    Takes the first whitespace-delimited token of *raw_id* and replaces
    hyphens with underscores so IDs from BED column 7 and consensus FASTA
    headers compare consistently.

    Returns an empty string for an empty or whitespace-only input (the
    previous implementation raised IndexError on ``"".split()[0]``).
    """
    parts = raw_id.strip().split()
    if not parts:  # empty/whitespace-only ID: nothing to normalize
        return ""
    return parts[0].replace("-", "_")
18+
19+
20+
def _strip_fastq_ext(name):
    """Return *name* without its FASTQ extension (matched case-insensitively)."""
    lowered = name.lower()
    matched = next((ext for ext in FASTQ_EXTS if lowered.endswith(ext)), None)
    if matched is None:
        return name
    return name[: -len(matched)]
26+
27+
28+
def _sample_remainder(name, sample):
29+
if name.startswith(sample):
30+
return name[len(sample):], 0
31+
pattern = re.compile(rf"(?:^|[_.-]){re.escape(sample)}(?:[_.-]|$)")
32+
match = pattern.search(name)
33+
if not match:
34+
return None, None
35+
start = match.start()
36+
if name[start] in "._-":
37+
start += 1
38+
return name[start + len(sample):], 1
39+
40+
41+
def _infer_read_token(remainder):
42+
if remainder is None:
43+
return None
44+
rem = remainder.lstrip("._-")
45+
patterns = [
46+
(1, r"(?:^|[_.-])R1(?:[_.-]|$)"),
47+
(2, r"(?:^|[_.-])R2(?:[_.-]|$)"),
48+
(1, r"(?:^|[_.-])1(?:[_.-]|$)"),
49+
(2, r"(?:^|[_.-])2(?:[_.-]|$)"),
50+
]
51+
for read, pattern in patterns:
52+
if re.search(pattern, rem):
53+
return read
54+
return None
55+
56+
57+
def _list_fastq_files(base_path):
    """Return the names of all FASTQ files directly inside *base_path*."""
    # str.endswith accepts a tuple of suffixes, matching any of them.
    return [
        entry
        for entry in os.listdir(base_path)
        if entry.lower().endswith(FASTQ_EXTS)
    ]
64+
65+
66+
def find_fastq_path(base_path, sample, read):
    """Resolve the single FASTQ path for *sample* / *read* in *base_path*.

    Raises:
        ValueError: when no candidate file is found, or when more than one
            file matches (e.g. unmerged lane files).
    """
    candidates = []
    for filename in _list_fastq_files(base_path):
        remainder, priority = _sample_remainder(_strip_fastq_ext(filename), sample)
        if remainder is None:
            continue
        inferred = _infer_read_token(remainder)
        if inferred is not None and str(inferred) == str(read):
            candidates.append((priority, filename))

    if not candidates:
        raise ValueError(
            f"No valid FASTQ found for sample '{sample}' read {read} in '{base_path}'."
        )

    candidates.sort(key=lambda item: (item[0], item[1]))
    if len(candidates) > 1:
        names = ", ".join(name for _, name in candidates)
        raise ValueError(
            f"Multiple FASTQs matched sample '{sample}' read {read}: {names}. "
            "Please merge lane files or provide one FASTQ per read."
        )
    return os.path.join(base_path, candidates[0][1])
90+
91+
92+
def validate_fastq_file(path):
    """Validate that *path* is a readable FASTQ file with a sane first record.

    Checks existence, non-zero size, readability (including gzip integrity for
    ``.gz`` files), and the basic structure of the first four-line FASTQ record.

    Raises:
        ValueError: describing the first problem encountered.
    """
    if not os.path.isfile(path):
        raise ValueError(f"FASTQ file not found: {path}")
    if os.path.getsize(path) == 0:
        raise ValueError(f"FASTQ file is empty: {path}")

    opener = gzip.open if path.lower().endswith(".gz") else open
    try:
        with opener(path, "rt", encoding="utf-8", errors="replace") as handle:
            lines = [handle.readline() for _ in range(4)]
    # gzip raises EOFError (not OSError) for streams truncated before the
    # end-of-stream marker, so it must be caught explicitly here.
    except (OSError, EOFError) as exc:
        raise ValueError(f"FASTQ file is unreadable or invalid gzip: {path}") from exc

    if any(line == "" for line in lines):
        raise ValueError(f"FASTQ appears truncated (missing first record): {path}")

    header, seq, plus, qual = [line.rstrip("\r\n") for line in lines]
    if not header.startswith("@"):
        raise ValueError(f"FASTQ first header does not start with '@': {path}")
    if not plus.startswith("+"):
        raise ValueError(f"FASTQ third line does not start with '+': {path}")
    if len(seq) == 0 or len(qual) == 0:
        raise ValueError(f"FASTQ first record has empty sequence/quality: {path}")
115+
116+
117+
def parse_consensus_ids(consensus_path):
    """Return the set of normalized TE IDs from consensus FASTA headers.

    Supports plain or gzip-compressed FASTA. Header lines containing no text
    after '>' are skipped (previously a bare '>' line crashed ID
    normalization with IndexError).

    Raises:
        ValueError: if the file contains no usable FASTA headers.
    """
    opener = gzip.open if consensus_path.lower().endswith(".gz") else open
    ids = set()
    with opener(consensus_path, "rt", encoding="utf-8", errors="replace") as handle:
        for line in handle:
            if not line.startswith(">"):
                continue
            header = line[1:].strip()
            if not header:  # bare '>' header: nothing to normalize
                continue
            te_id = normalize_te_id(header)
            if te_id:
                ids.add(te_id)
    if not ids:
        raise ValueError(
            f"No FASTA headers were found in consensus TE file: {consensus_path}"
        )
    return ids
131+
132+
133+
def parse_reference_contigs(ref_genome_path):
    """Return the set of contig names (first header token) from a genome FASTA.

    Supports plain or gzip-compressed FASTA.

    Raises:
        ValueError: if no FASTA headers are present.
    """
    opener = gzip.open if ref_genome_path.lower().endswith(".gz") else open
    contigs = set()
    with opener(ref_genome_path, "rt", encoding="utf-8", errors="replace") as handle:
        headers = (ln[1:].strip() for ln in handle if ln.startswith(">"))
        for header in headers:
            if not header:
                continue
            name = header.split()[0]
            if name:
                contigs.add(name)
    if not contigs:
        raise ValueError(
            f"No FASTA headers were found in reference genome file: {ref_genome_path}"
        )
    return contigs
149+
150+
151+
def validate_reference_te_ids(ref_bed_path, consensus_path, ref_genome_path):
    """Cross-check the reference TE BED against consensus and genome FASTAs.

    Validates that every BED record has at least 7 columns with integer
    coordinates in columns 2/3, that every chromosome (column 1) exists among
    the reference genome FASTA headers, and that every TE ID (column 7) exists
    in the consensus TE FASTA.

    Raises:
        ValueError: on the first structural problem found in the BED, or
            summarizing missing chromosomes / TE IDs after the scan.
    """

    def _preview(sorted_items):
        # Show at most 10 offending names, plus a count of any remainder.
        shown = ", ".join(sorted_items[:10])
        hidden = len(sorted_items) - 10
        return shown + (f" ... (+{hidden} more)" if hidden > 0 else "")

    consensus_ids = parse_consensus_ids(consensus_path)
    ref_contigs = parse_reference_contigs(ref_genome_path)
    bed_ids = set()
    missing_contigs = set()

    with open(ref_bed_path, "r", encoding="utf-8", errors="replace") as handle:
        for line_num, line in enumerate(handle, start=1):
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            # Accept both tab-delimited BED and whitespace-delimited BED-like files.
            cols = stripped.split()
            if len(cols) < 7:
                raise ValueError(
                    f"Reference BED must have at least 7 columns (line {line_num}): {ref_bed_path}"
                )
            try:
                int(cols[1])
                int(cols[2])
            except ValueError as exc:
                raise ValueError(
                    f"Reference BED columns 2 and 3 must be integers (line {line_num}): {ref_bed_path}"
                ) from exc

            if cols[0] not in ref_contigs:
                missing_contigs.add(cols[0])

            te_id = normalize_te_id(cols[6])
            if not te_id:
                raise ValueError(
                    f"Reference BED column 7 TE ID is empty (line {line_num}): {ref_bed_path}"
                )
            bed_ids.add(te_id)

    if missing_contigs:
        raise ValueError(
            "Reference BED chromosomes not found in reference genome FASTA headers: "
            f"{_preview(sorted(missing_contigs))}"
        )

    if not bed_ids:
        raise ValueError(f"No TE IDs were found in column 7 of BED: {ref_bed_path}")

    missing_ids = sorted(bed_ids - consensus_ids)
    if missing_ids:
        raise ValueError(
            "Reference BED TE IDs (column 7) not found in consensusTEs FASTA: "
            f"{_preview(missing_ids)}"
        )
209+
210+
211+
def validate_inputs(args):
    """Validate all user-supplied input files before launching the pipeline.

    Checks that the four reference files exist, that the FASTQ base directory
    exists, that the reference BED agrees with the consensus FASTA and genome
    FASTA, and that each sample resolves to exactly one readable R1/R2 FASTQ
    pair.

    Raises:
        ValueError: describing the first invalid input encountered.
    """
    reference_inputs = (
        ("consensusTEs", args.consensusTEs),
        ("ref_genome", args.ref_genome),
        ("ref_te_locations", args.ref_te_locations),
        ("euchromatin", args.euchromatin),
    )
    for label, path in reference_inputs:
        if not os.path.isfile(path):
            raise ValueError(f"{label} file not found: {path}")

    if not os.path.isdir(args.fq_base_path):
        raise ValueError(f"FASTQ base directory not found: {args.fq_base_path}")

    validate_reference_te_ids(
        args.ref_te_locations, args.consensusTEs, args.ref_genome
    )

    for sample in args.samples:
        fq1 = find_fastq_path(args.fq_base_path, sample, 1)
        fq2 = find_fastq_path(args.fq_base_path, sample, 2)
        for fq in (fq1, fq2):
            validate_fastq_file(fq)
        logging.info(f"Validated FASTQs for {sample}: {fq1} | {fq2}")
235+
236+
11237
def main():
12238
# Setup logging
13239
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -111,6 +337,16 @@ def main():
111337
action="store_true",
112338
help="Delete large intermediate files (fastp/, aligned/, downsampled/, candidate_regions_data/) after use."
113339
)
340+
parser.add_argument(
341+
"--disable_reference_detection",
342+
action="store_true",
343+
help="Disable reference TE detection (skip reference feature vectors and reference model prediction)."
344+
)
345+
parser.add_argument(
346+
"--dry_run",
347+
action="store_true",
348+
help="Run Snakemake in dry-run mode to validate DAG and inputs without executing jobs."
349+
)
114350
#parser.add_argument(
115351
# "--target_coverage",
116352
# type=int,
@@ -131,6 +367,12 @@ def main():
131367
logging.error(f"Snakefile not found in workflow directory '{args.workflow_dir}'. Expected at '{snakefile_path}'.")
132368
sys.exit(1)
133369

370+
try:
371+
validate_inputs(args)
372+
except ValueError as exc:
373+
logging.error(f"Input validation failed: {exc}")
374+
sys.exit(1)
375+
134376
# Ensure working directory exists
135377
if not os.path.isdir(args.workdir):
136378
logging.info(f"Working directory '{args.workdir}' does not exist. Creating it.")
@@ -154,6 +396,7 @@ def main():
154396
#"target_coverage": args.target_coverage,
155397
"target_coverage": 50,
156398
"cleanup_intermediates": args.cleanup_intermediates,
399+
"disable_reference_detection": args.disable_reference_detection,
157400

158401
}
159402

@@ -173,7 +416,30 @@ def main():
173416
os.makedirs(output_dir, exist_ok=True)
174417

175418

176-
# 3) Unlock working directory
419+
# 3) Prepare the Snakemake command
420+
snakemake_cmd = [
421+
"snakemake",
422+
] + output_targets + [
423+
"--cores", str(args.threads),
424+
"-s", snakefile_path,
425+
"--configfile", os.path.abspath(config_path)
426+
]
427+
428+
if args.dry_run:
429+
snakemake_cmd.append("--dry-run")
430+
logging.info("Running Snakemake dry-run command:\n" + " ".join(snakemake_cmd))
431+
try:
432+
subprocess.run(
433+
snakemake_cmd,
434+
cwd=args.workdir,
435+
check=True
436+
)
437+
except subprocess.CalledProcessError as e:
438+
logging.error(f"Snakemake dry-run failed with exit code {e.returncode}")
439+
sys.exit(e.returncode)
440+
return
441+
442+
# 4) Unlock working directory
177443
snakemake_cmd_unlock = [
178444
"snakemake",
179445
] + output_targets + [
@@ -193,19 +459,10 @@ def main():
193459
logging.error(f"Snakemake failed with exit code {e.returncode}")
194460
sys.exit(e.returncode)
195461

196-
# 4) Prepare the Snakemake command
197-
snakemake_cmd = [
198-
"snakemake",
199-
] + output_targets + [
200-
"--cores", str(args.threads),
201-
"-s", snakefile_path,
202-
"--configfile", os.path.abspath(config_path)
203-
]
204-
205-
# 4) Print the command for user visibility
462+
# 5) Print the command for user visibility
206463
logging.info("Running Snakemake command:\n" + " ".join(snakemake_cmd))
207464

208-
# 5) Execute the Snakemake command from the workflow directory
465+
# 6) Execute the Snakemake command from the workflow directory
209466
try:
210467
subprocess.run(
211468
snakemake_cmd,

0 commit comments

Comments
 (0)