Skip to content

Commit 8feed1a

Browse files
committed
Add option to disable reference TE detection, add dry run option, implement tests for input files
1 parent b8a0896 commit 8feed1a

File tree

5 files changed

+509
-17
lines changed

5 files changed

+509
-17
lines changed

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,19 +69,33 @@ python TEforest.py \
6969
--ref_model <path/to/reference_model.pkl> \
7070
--fq_base_path <path/to/fastq/files> \
7171
--cleanup_intermediates \
72+
--disable_reference_detection \
73+
--dry_run \
7274
--samples A1 A2 A3
7375
```
7476

7577
- **`--workflow_dir`**: Directory containing the `Snakefile` (`workflow/Snakefile`).
7678
- **`--workdir`**: Directory to store outputs and logs.
7779
- **`--threads`**: Number of CPU threads to use. 16 per sample is recommended.
7880
- **`--consensusTEs`, `--ref_genome`, `--ref_te_locations`, `--euchromatin`**: Input reference files for TE detection. All calls outside of the regions denoted in euchromatin will be filtered. Example files used for Drosophila melanogaster are located in example_files/. Be aware that BWA-mem2 will treat IUPAC bases as missing, so TEforest may have reduced performance on consensus sequences with high IUPAC content.
81+
- Current reference BED usage in inference: columns 1/2/3 are used as genomic coordinates, and column 7 is used as the TE family ID.
82+
- Columns 4/5/6 and any trailing columns are accepted but are not used by the pipeline.
83+
- BED can be tab-delimited or whitespace-delimited.
7984
- **`--model`**: Path to the non-reference model (optional). If omitted, TEforest auto-selects a model based on the observed coverage (5X/10X/20X/30X/40X/50X). If the data are not downsampled (e.g., 48X), the next highest model is chosen (50X).
8085
- **`--ref_model`**: Path to the reference model (optional). Auto-selection follows the same coverage logic as above.
8186
- **`--fq_base_path`**: Directory containing FASTQ files. TEforest will match common read naming conventions (e.g., `_R1/_R2`, `_1/_2`, `.1/.2`, `R1/R2`, lane tokens like `_L001_R1_001`) as long as the sample name appears in the filename.
8287
- **`--cleanup_intermediates`**: Optional flag to delete large intermediate files after they are used (e.g., `fastp/`, `aligned/`, `downsampled/`, `candidate_regions_data/`). Omit this if you want to keep read alignments or candidate-region BAMs for debugging.
88+
- **`--disable_reference_detection`**: Optional flag to skip reference TE feature-vector creation and reference model prediction. This can be useful for genomes with very large numbers of old reference TEs, where reference detection can dominate runtime.
89+
- **`--dry_run`**: Optional flag to run `snakemake --dry-run` through the wrapper, so you can validate file naming, inputs, and DAG construction before launching compute-heavy jobs.
8390
- **`--samples`**: List of sample identifiers to process (space-separated). Note that more than one sample can be run in parallel.
8491

92+
Input validation (runs for both normal execution and `--dry_run`):
93+
- FASTQs are resolved per sample/read, must exist, be non-empty, and have a valid first FASTQ record.
94+
- Reference BED must have at least 7 whitespace-delimited columns.
95+
- Reference BED chromosome names (column 1) must match sequence headers in `ref_genome`.
96+
- Every TE ID in reference BED column 7 must be present in `consensusTEs` FASTA headers.
97+
- Extra TE families in the consensus FASTA are allowed.
98+
8599
The script will generate:
86100
- A `config.yaml` in your specified `workdir` with all parameters.
87101
- Intermediate files used to run the pipeline

TEforest.py

Lines changed: 269 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,239 @@
11
#!/usr/bin/env python3
22

33
import argparse
4+
import gzip
45
import os
6+
import re
57
import sys
68
import subprocess
79
import textwrap
810
import yaml # Ensure PyYAML is installed: pip install pyyaml
911
import logging
1012

13+
# Recognized FASTQ filename extensions; compressed variants listed first so
# extension stripping removes the full ".fastq.gz"/".fq.gz" suffix.
FASTQ_EXTS = (".fastq.gz", ".fq.gz", ".fastq", ".fq")
14+
15+
16+
def normalize_te_id(raw_id):
    """Return a canonical TE family ID.

    Takes the first whitespace-delimited token of *raw_id* and replaces
    hyphens with underscores so IDs from BED column 7 and consensus FASTA
    headers compare consistently.

    Returns an empty string for an empty or whitespace-only input (the
    previous implementation raised IndexError on ``"".split()[0]``).
    """
    parts = raw_id.strip().split()
    if not parts:  # empty/whitespace-only ID: nothing to normalize
        return ""
    return parts[0].replace("-", "_")
18+
19+
20+
def _strip_fastq_ext(name):
    """Return *name* without its FASTQ extension (matched case-insensitively)."""
    lowered = name.lower()
    matched = next((ext for ext in FASTQ_EXTS if lowered.endswith(ext)), None)
    if matched is None:
        return name
    return name[: -len(matched)]
26+
27+
28+
def _sample_remainder(name, sample):
29+
if name.startswith(sample):
30+
return name[len(sample):], 0
31+
pattern = re.compile(rf"(?:^|[_.-]){re.escape(sample)}(?:[_.-]|$)")
32+
match = pattern.search(name)
33+
if not match:
34+
return None, None
35+
start = match.start()
36+
if name[start] in "._-":
37+
start += 1
38+
return name[start + len(sample):], 1
39+
40+
41+
def _infer_read_token(remainder):
42+
if remainder is None:
43+
return None
44+
rem = remainder.lstrip("._-")
45+
patterns = [
46+
(1, r"(?:^|[_.-])R1(?:[_.-]|$)"),
47+
(2, r"(?:^|[_.-])R2(?:[_.-]|$)"),
48+
(1, r"(?:^|[_.-])1(?:[_.-]|$)"),
49+
(2, r"(?:^|[_.-])2(?:[_.-]|$)"),
50+
]
51+
for read, pattern in patterns:
52+
if re.search(pattern, rem):
53+
return read
54+
return None
55+
56+
57+
def _list_fastq_files(base_path):
    """Return the names of all FASTQ files directly inside *base_path*."""
    # str.endswith accepts a tuple of suffixes, matching any of them.
    return [
        entry
        for entry in os.listdir(base_path)
        if entry.lower().endswith(FASTQ_EXTS)
    ]
64+
65+
66+
def find_fastq_path(base_path, sample, read):
    """Resolve the single FASTQ path for *sample* / *read* in *base_path*.

    Raises:
        ValueError: when no candidate file is found, or when more than one
            file matches (e.g. unmerged lane files).
    """
    candidates = []
    for filename in _list_fastq_files(base_path):
        remainder, priority = _sample_remainder(_strip_fastq_ext(filename), sample)
        if remainder is None:
            continue
        inferred = _infer_read_token(remainder)
        if inferred is not None and str(inferred) == str(read):
            candidates.append((priority, filename))

    if not candidates:
        raise ValueError(
            f"No valid FASTQ found for sample '{sample}' read {read} in '{base_path}'."
        )

    candidates.sort(key=lambda item: (item[0], item[1]))
    if len(candidates) > 1:
        names = ", ".join(name for _, name in candidates)
        raise ValueError(
            f"Multiple FASTQs matched sample '{sample}' read {read}: {names}. "
            "Please merge lane files or provide one FASTQ per read."
        )
    return os.path.join(base_path, candidates[0][1])
90+
91+
92+
def validate_fastq_file(path):
    """Validate that *path* is a readable FASTQ file with a sane first record.

    Checks existence, non-zero size, readability (including gzip integrity for
    ``.gz`` files), and the basic structure of the first four-line FASTQ record.

    Raises:
        ValueError: describing the first problem encountered.
    """
    if not os.path.isfile(path):
        raise ValueError(f"FASTQ file not found: {path}")
    if os.path.getsize(path) == 0:
        raise ValueError(f"FASTQ file is empty: {path}")

    opener = gzip.open if path.lower().endswith(".gz") else open
    try:
        with opener(path, "rt", encoding="utf-8", errors="replace") as handle:
            lines = [handle.readline() for _ in range(4)]
    # gzip raises EOFError (not OSError) for streams truncated before the
    # end-of-stream marker, so it must be caught explicitly here.
    except (OSError, EOFError) as exc:
        raise ValueError(f"FASTQ file is unreadable or invalid gzip: {path}") from exc

    if any(line == "" for line in lines):
        raise ValueError(f"FASTQ appears truncated (missing first record): {path}")

    header, seq, plus, qual = [line.rstrip("\r\n") for line in lines]
    if not header.startswith("@"):
        raise ValueError(f"FASTQ first header does not start with '@': {path}")
    if not plus.startswith("+"):
        raise ValueError(f"FASTQ third line does not start with '+': {path}")
    if len(seq) == 0 or len(qual) == 0:
        raise ValueError(f"FASTQ first record has empty sequence/quality: {path}")
115+
116+
117+
def parse_consensus_ids(consensus_path):
    """Return the set of normalized TE IDs from consensus FASTA headers.

    Supports plain or gzip-compressed FASTA. Header lines containing no text
    after '>' are skipped (previously a bare '>' line crashed ID
    normalization with IndexError).

    Raises:
        ValueError: if the file contains no usable FASTA headers.
    """
    opener = gzip.open if consensus_path.lower().endswith(".gz") else open
    ids = set()
    with opener(consensus_path, "rt", encoding="utf-8", errors="replace") as handle:
        for line in handle:
            if not line.startswith(">"):
                continue
            header = line[1:].strip()
            if not header:  # bare '>' header: nothing to normalize
                continue
            te_id = normalize_te_id(header)
            if te_id:
                ids.add(te_id)
    if not ids:
        raise ValueError(
            f"No FASTA headers were found in consensus TE file: {consensus_path}"
        )
    return ids
131+
132+
133+
def parse_reference_contigs(ref_genome_path):
    """Return the set of contig names (first header token) from a genome FASTA.

    Supports plain or gzip-compressed FASTA.

    Raises:
        ValueError: if no FASTA headers are present.
    """
    opener = gzip.open if ref_genome_path.lower().endswith(".gz") else open
    contigs = set()
    with opener(ref_genome_path, "rt", encoding="utf-8", errors="replace") as handle:
        headers = (ln[1:].strip() for ln in handle if ln.startswith(">"))
        for header in headers:
            if not header:
                continue
            name = header.split()[0]
            if name:
                contigs.add(name)
    if not contigs:
        raise ValueError(
            f"No FASTA headers were found in reference genome file: {ref_genome_path}"
        )
    return contigs
149+
150+
151+
def validate_reference_te_ids(ref_bed_path, consensus_path, ref_genome_path):
    """Cross-check the reference TE BED against consensus and genome FASTAs.

    Validates that every BED record has at least 7 columns with integer
    coordinates in columns 2/3, that every chromosome (column 1) exists among
    the reference genome FASTA headers, and that every TE ID (column 7) exists
    in the consensus TE FASTA.

    Raises:
        ValueError: on the first structural problem found in the BED, or
            summarizing missing chromosomes / TE IDs after the scan.
    """

    def _preview(sorted_items):
        # Show at most 10 offending names, plus a count of any remainder.
        shown = ", ".join(sorted_items[:10])
        hidden = len(sorted_items) - 10
        return shown + (f" ... (+{hidden} more)" if hidden > 0 else "")

    consensus_ids = parse_consensus_ids(consensus_path)
    ref_contigs = parse_reference_contigs(ref_genome_path)
    bed_ids = set()
    missing_contigs = set()

    with open(ref_bed_path, "r", encoding="utf-8", errors="replace") as handle:
        for line_num, line in enumerate(handle, start=1):
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            # Accept both tab-delimited BED and whitespace-delimited BED-like files.
            cols = stripped.split()
            if len(cols) < 7:
                raise ValueError(
                    f"Reference BED must have at least 7 columns (line {line_num}): {ref_bed_path}"
                )
            try:
                int(cols[1])
                int(cols[2])
            except ValueError as exc:
                raise ValueError(
                    f"Reference BED columns 2 and 3 must be integers (line {line_num}): {ref_bed_path}"
                ) from exc

            if cols[0] not in ref_contigs:
                missing_contigs.add(cols[0])

            te_id = normalize_te_id(cols[6])
            if not te_id:
                raise ValueError(
                    f"Reference BED column 7 TE ID is empty (line {line_num}): {ref_bed_path}"
                )
            bed_ids.add(te_id)

    if missing_contigs:
        raise ValueError(
            "Reference BED chromosomes not found in reference genome FASTA headers: "
            f"{_preview(sorted(missing_contigs))}"
        )

    if not bed_ids:
        raise ValueError(f"No TE IDs were found in column 7 of BED: {ref_bed_path}")

    missing_ids = sorted(bed_ids - consensus_ids)
    if missing_ids:
        raise ValueError(
            "Reference BED TE IDs (column 7) not found in consensusTEs FASTA: "
            f"{_preview(missing_ids)}"
        )
209+
210+
211+
def validate_inputs(args):
    """Validate all user-supplied input files before launching the pipeline.

    Checks that the four reference files exist, that the FASTQ base directory
    exists, that the reference BED agrees with the consensus FASTA and genome
    FASTA, and that each sample resolves to exactly one readable R1/R2 FASTQ
    pair.

    Raises:
        ValueError: describing the first invalid input encountered.
    """
    reference_inputs = (
        ("consensusTEs", args.consensusTEs),
        ("ref_genome", args.ref_genome),
        ("ref_te_locations", args.ref_te_locations),
        ("euchromatin", args.euchromatin),
    )
    for label, path in reference_inputs:
        if not os.path.isfile(path):
            raise ValueError(f"{label} file not found: {path}")

    if not os.path.isdir(args.fq_base_path):
        raise ValueError(f"FASTQ base directory not found: {args.fq_base_path}")

    validate_reference_te_ids(
        args.ref_te_locations, args.consensusTEs, args.ref_genome
    )

    for sample in args.samples:
        fq1 = find_fastq_path(args.fq_base_path, sample, 1)
        fq2 = find_fastq_path(args.fq_base_path, sample, 2)
        for fq in (fq1, fq2):
            validate_fastq_file(fq)
        logging.info(f"Validated FASTQs for {sample}: {fq1} | {fq2}")
235+
236+
11237
def main():
12238
# Setup logging
13239
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -111,6 +337,16 @@ def main():
111337
action="store_true",
112338
help="Delete large intermediate files (fastp/, aligned/, downsampled/, candidate_regions_data/) after use."
113339
)
340+
parser.add_argument(
341+
"--disable_reference_detection",
342+
action="store_true",
343+
help="Disable reference TE detection (skip reference feature vectors and reference model prediction)."
344+
)
345+
parser.add_argument(
346+
"--dry_run",
347+
action="store_true",
348+
help="Run Snakemake in dry-run mode to validate DAG and inputs without executing jobs."
349+
)
114350
#parser.add_argument(
115351
# "--target_coverage",
116352
# type=int,
@@ -131,6 +367,12 @@ def main():
131367
logging.error(f"Snakefile not found in workflow directory '{args.workflow_dir}'. Expected at '{snakefile_path}'.")
132368
sys.exit(1)
133369

370+
try:
371+
validate_inputs(args)
372+
except ValueError as exc:
373+
logging.error(f"Input validation failed: {exc}")
374+
sys.exit(1)
375+
134376
# Ensure working directory exists
135377
if not os.path.isdir(args.workdir):
136378
logging.info(f"Working directory '{args.workdir}' does not exist. Creating it.")
@@ -154,6 +396,7 @@ def main():
154396
#"target_coverage": args.target_coverage,
155397
"target_coverage": 50,
156398
"cleanup_intermediates": args.cleanup_intermediates,
399+
"disable_reference_detection": args.disable_reference_detection,
157400

158401
}
159402

@@ -173,7 +416,30 @@ def main():
173416
os.makedirs(output_dir, exist_ok=True)
174417

175418

176-
# 3) Unlock working directory
419+
# 3) Prepare the Snakemake command
420+
snakemake_cmd = [
421+
"snakemake",
422+
] + output_targets + [
423+
"--cores", str(args.threads),
424+
"-s", snakefile_path,
425+
"--configfile", os.path.abspath(config_path)
426+
]
427+
428+
if args.dry_run:
429+
snakemake_cmd.append("--dry-run")
430+
logging.info("Running Snakemake dry-run command:\n" + " ".join(snakemake_cmd))
431+
try:
432+
subprocess.run(
433+
snakemake_cmd,
434+
cwd=args.workdir,
435+
check=True
436+
)
437+
except subprocess.CalledProcessError as e:
438+
logging.error(f"Snakemake dry-run failed with exit code {e.returncode}")
439+
sys.exit(e.returncode)
440+
return
441+
442+
# 4) Unlock working directory
177443
snakemake_cmd_unlock = [
178444
"snakemake",
179445
] + output_targets + [
@@ -193,19 +459,10 @@ def main():
193459
logging.error(f"Snakemake failed with exit code {e.returncode}")
194460
sys.exit(e.returncode)
195461

196-
# 4) Prepare the Snakemake command
197-
snakemake_cmd = [
198-
"snakemake",
199-
] + output_targets + [
200-
"--cores", str(args.threads),
201-
"-s", snakefile_path,
202-
"--configfile", os.path.abspath(config_path)
203-
]
204-
205-
# 4) Print the command for user visibility
462+
# 5) Print the command for user visibility
206463
logging.info("Running Snakemake command:\n" + " ".join(snakemake_cmd))
207464

208-
# 5) Execute the Snakemake command from the workflow directory
465+
# 6) Execute the Snakemake command from the workflow directory
209466
try:
210467
subprocess.run(
211468
snakemake_cmd,

0 commit comments

Comments
 (0)