-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimulate_random.sh
More file actions
executable file
·225 lines (187 loc) · 8.66 KB
/
simulate_random.sh
File metadata and controls
executable file
·225 lines (187 loc) · 8.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#!/bin/bash
# Script to simulate <n> reads from a random node in a panman file
# with mutations applied (5 SNPs, 1 insertion, 2 deletions by default)
# Usage: ./simulate_random.sh <panman_file> <n_reads> [seed] [threads]
#
# Arguments:
# panman_file: Path to the .panman file
# n_reads: Number of reads to simulate
# seed: (optional) Random seed for reproducibility
# threads: (optional) Number of threads for simulation (default: 4)
set -e
if [ "$#" -lt 2 ]; then
echo "Usage: $0 <panman_file> <n_reads> [seed] [threads]"
echo " panman_file: Path to the .panman file"
echo " n_reads: Number of reads to simulate"
echo " seed: (optional) Random seed for reproducibility"
echo " threads: (optional) Number of threads for simulation (default: 4)"
exit 1
fi
PANMAN_FILE="$(realpath "$1")"
N_READS="$2"
SEED="${3:-$RANDOM}"
CPUS="${4:-4}"
# Check if panman file exists
if [ ! -f "$PANMAN_FILE" ]; then
echo "Error: Panman file not found: $PANMAN_FILE"
exit 1
fi
# Get script directory and binary paths
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ORIGINAL_DIR="$(pwd)"
PANMAP_BIN="$SCRIPT_DIR/build/bin/panmap"
SIMULATE_BIN="$SCRIPT_DIR/build/bin/simulate"
# Check if binaries exist
if [ ! -f "$PANMAP_BIN" ]; then
echo "Error: panmap binary not found: $PANMAP_BIN"
echo "Please build the project first (./make.sh or cmake + make)"
exit 1
fi
if [ ! -f "$SIMULATE_BIN" ]; then
echo "Error: simulate binary not found: $SIMULATE_BIN"
echo "Please build the project first (./make.sh or cmake + make)"
exit 1
fi
rm -f ./panmap_*.map
rm -f ./simulated_*
# Create temp directory for output
TEMP_DIR=$(mktemp -d -t simulate_random_XXXXXX)
trap "rm -rf $TEMP_DIR" EXIT
# Determine output file prefix early
PANMAN_BASENAME=$(basename "$PANMAN_FILE" .panman)
OUTPUT_PREFIX="simulated_${PANMAN_BASENAME}_${N_READS}reads_seed${SEED}"
echo "[simulate_random] =========================================="
echo "[simulate_random] Panman file: $PANMAN_FILE"
echo "[simulate_random] Number of reads: $N_READS"
echo "[simulate_random] Seed: $SEED"
echo "[simulate_random] Threads: $CPUS"
echo "[simulate_random] Mutations: 5 SNPs, 1 insertion, 2 deletions"
echo "[simulate_random] =========================================="
echo "[simulate_random] Output files will be created in current directory:"
echo " $ORIGINAL_DIR/${OUTPUT_PREFIX}_R1.fastq"
echo " $ORIGINAL_DIR/${OUTPUT_PREFIX}_R2.fastq"
echo " $ORIGINAL_DIR/${OUTPUT_PREFIX}_node.fasta"
echo "[simulate_random] =========================================="
echo ""
# Step 1: Dump a random node sequence from the panman file
echo "[simulate_random] Step 1: Dumping random node from panman..."
# Clean up any old random node files to avoid confusion
PANMAN_DIR=$(dirname "$PANMAN_FILE")
rm -f "$PANMAN_DIR"/*.random.*.fa 2>/dev/null
"$PANMAP_BIN" "$PANMAN_FILE" --dump-random-node --seed "$SEED" -t 8 2>&1 | tee "$TEMP_DIR/dump_log.txt"
# panmap writes the random node file next to the .panman file, so look there
# The filename pattern is: {panman_file}.random.{node_name}.fa
RANDOM_NODE_FA=$(ls "$PANMAN_DIR"/"$(basename "$PANMAN_FILE")".random.*.fa 2>/dev/null | head -n1)
if [ -z "$RANDOM_NODE_FA" ]; then
echo "Error: Failed to generate random node fasta"
echo "Checked directory: $PANMAN_DIR"
echo "Pattern: $(basename "$PANMAN_FILE").random.*.fa"
ls -la "$PANMAN_DIR"/*.random.*.fa 2>/dev/null || echo "No random files found"
exit 1
fi
# Copy to temp directory for simulate to use
cp "$RANDOM_NODE_FA" "$TEMP_DIR/"
RANDOM_NODE_FA="$TEMP_DIR/$(basename "$RANDOM_NODE_FA")"
echo "[simulate_random] Random node fasta: $RANDOM_NODE_FA"
# Extract node name from the fasta header
NODE_NAME=$(head -n1 "$RANDOM_NODE_FA" | sed 's/^>//')
echo "[simulate_random] Selected node: $NODE_NAME"
# Step 2: Simulate reads from the random node
echo "[simulate_random] Step 2: Simulating $N_READS reads with $CPUS threads..."
echo "[simulate_random] Mutations: 5 SNPs, 1 insertion, 2 deletions"
# Default parameters (matching Snakefile defaults)
MODEL="NovaSeq"
# Run simulate with mutations: 5 SNPs, 1 insertion, 2 deletions (3 indels total)
cd "$TEMP_DIR"
"$SIMULATE_BIN" --panmat "$PANMAN_FILE" --ref "$NODE_NAME" --out_dir "$TEMP_DIR" \
--n_reads "$N_READS" --rep 1 --model "$MODEL" --cpus "$CPUS" --seed "$SEED" \
--mutnum 5 1 2 --indel_len 1 9
# Find generated read files
R1_FASTQ=$(find "$TEMP_DIR" -name "*_R1.fastq" 2>/dev/null | head -n1)
R2_FASTQ=$(find "$TEMP_DIR" -name "*_R2.fastq" 2>/dev/null | head -n1)
if [ -z "$R1_FASTQ" ] || [ -z "$R2_FASTQ" ]; then
echo "Error: Failed to generate read files"
exit 1
fi
# Copy output files to original working directory
cp "$R1_FASTQ" "$ORIGINAL_DIR/${OUTPUT_PREFIX}_R1.fastq"
cp "$R2_FASTQ" "$ORIGINAL_DIR/${OUTPUT_PREFIX}_R2.fastq"
cp "$RANDOM_NODE_FA" "$ORIGINAL_DIR/${OUTPUT_PREFIX}_node.fasta"
# Get absolute paths of created files
FINAL_R1="$ORIGINAL_DIR/${OUTPUT_PREFIX}_R1.fastq"
FINAL_R2="$ORIGINAL_DIR/${OUTPUT_PREFIX}_R2.fastq"
FINAL_NODE="$ORIGINAL_DIR/${OUTPUT_PREFIX}_node.fasta"
echo ""
echo "[simulate_random] =========================================="
echo "[simulate_random] SUCCESS! Files created:"
echo " $FINAL_R1"
echo " $FINAL_R2"
echo " $FINAL_NODE"
echo "[simulate_random] =========================================="
echo "[simulate_random] Node: $NODE_NAME"
echo "[simulate_random] Seed: $SEED (use this to reproduce)"
echo ""
echo "[simulate_random] Running placement..."
echo "[simulate_random] Command: build/bin/panmap $PANMAN_FILE ${OUTPUT_PREFIX}_R1.fastq ${OUTPUT_PREFIX}_R2.fastq --stop align -t 1 -k 27 -s 8 -l 1 -f"
echo "[simulate_random] =========================================="
echo ""
# Run placement
cd "$ORIGINAL_DIR"
"$PANMAP_BIN" "$PANMAN_FILE" "${OUTPUT_PREFIX}_R1.fastq" "${OUTPUT_PREFIX}_R2.fastq" --stop genotype
echo ""
echo "[simulate_random] =========================================="
echo "[simulate_random] Placement completed!"
echo "[simulate_random] Expected node: $NODE_NAME"
echo ""
echo "[simulate_random] Checking placement results..."
# Check if the placement file exists
PLACEMENT_FILE="$PANMAN_FILE.placement.tsv"
if [ -f "$PLACEMENT_FILE" ]; then
echo "[simulate_random] Placement results from $PLACEMENT_FILE:"
echo "[simulate_random] ----------------------------------------"
cat "$PLACEMENT_FILE" | column -t
echo "[simulate_random] ----------------------------------------"
# Extract the best node(s) for each metric (may be comma-separated)
# Use -F'\t' to properly handle tab-separated fields with empty values
echo ""
echo "[simulate_random] Summary:"
RAW_NODES=$(grep "^raw" "$PLACEMENT_FILE" | awk -F'\t' '{print $4}')
JACCARD_NODES=$(grep "^jaccard" "$PLACEMENT_FILE" | awk -F'\t' '{print $4}')
COSINE_NODES=$(grep "^cosine" "$PLACEMENT_FILE" | awk -F'\t' '{print $4}')
WEIGHTED_NODES=$(grep "^weighted_jaccard" "$PLACEMENT_FILE" | awk -F'\t' '{print $4}')
# Helper function to check if expected node is in comma-separated list
check_node_in_list() {
local expected="$1"
local node_list="$2"
# Convert comma-separated list to space-separated and check each
echo "$node_list" | tr ',' '\n' | grep -Fxq "$expected"
return $?
}
if check_node_in_list "$NODE_NAME" "$RAW_NODES"; then
echo "[simulate_random] ✓ Raw matches correctly placed at $NODE_NAME"
[ "$RAW_NODES" != "$NODE_NAME" ] && echo " (tied with: $RAW_NODES)"
else
echo "[simulate_random] ✗ Raw matches placed at $RAW_NODES (expected: $NODE_NAME)"
fi
if check_node_in_list "$NODE_NAME" "$JACCARD_NODES"; then
echo "[simulate_random] ✓ Jaccard correctly placed at $NODE_NAME"
[ "$JACCARD_NODES" != "$NODE_NAME" ] && echo " (tied with: $JACCARD_NODES)"
else
echo "[simulate_random] ✗ Jaccard placed at $JACCARD_NODES (expected: $NODE_NAME)"
fi
if check_node_in_list "$NODE_NAME" "$COSINE_NODES"; then
echo "[simulate_random] ✓ Cosine correctly placed at $NODE_NAME"
[ "$COSINE_NODES" != "$NODE_NAME" ] && echo " (tied with: $COSINE_NODES)"
else
echo "[simulate_random] ✗ Cosine placed at $COSINE_NODES (expected: $NODE_NAME)"
fi
if check_node_in_list "$NODE_NAME" "$WEIGHTED_NODES"; then
echo "[simulate_random] ✓ Weighted Jaccard correctly placed at $NODE_NAME"
[ "$WEIGHTED_NODES" != "$NODE_NAME" ] && echo " (tied with: $WEIGHTED_NODES)"
else
echo "[simulate_random] ✗ Weighted Jaccard placed at $WEIGHTED_NODES (expected: $NODE_NAME)"
fi
else
echo "[simulate_random] Warning: Placement file not found: $PLACEMENT_FILE"
fi
echo "[simulate_random] =========================================="