-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount_overlapping_reads_from_BAM_files.jl
58 lines (49 loc) · 1.88 KB
/
count_overlapping_reads_from_BAM_files.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
using Glob
using XAM
using DataFrames
using CSV
#using BenchmarkTools
function extract_names(files::Vector{String})
df = DataFrame(Sample = String[], Reads = Vector{String}[])
for file in files
sample = split(basename(file), ".")[1]
sample = split(sample, "_")[1] # Note: check if the same sample names can be extracted.
println(sample)
read_name = String[]
reader = open(BAM.Reader, file)
# for record in reader
# push!(read_name, replace(BAM.tempname(record), r"_\w+$" => ""))
# end
record = BAM.Record()
while !eof(reader)
empty!(record)
read!(reader, record)
push!(read_name, replace(BAM.tempname(record), r"_\w+$" => ""))
end
close(reader)
push!(df, [sample, read_name])
end
return df
end
function overlapping(umi::Vector{String}, agent::Vector{String}, p::String)
df_merge = leftjoin(extract_names(umi), extract_names(agent), on = :Sample, makeunique=true)
df_merge.Overlapping = intersect.(df_merge.Reads, df_merge.Reads_1)
df_merge.Umi = df_merge.Reads .|> length
df_merge.UmiUnique = df_merge.Reads .|> unique .|> length
df_merge.Agent = df_merge.Reads_1 .|> length
df_merge.AgentUnique = df_merge.Reads_1 .|> unique .|> length
df_merge.OverlappingSize = df_merge.Overlapping .|> length
#println(df_merge)
CSV.write("$(p)_overlappping_reads_df.csv", df_merge[!, [1, 5, 6, 7, 8, 9]])
end
##### set project, bam file path #####
project = "S94-RD-50"
umi_files = glob("./UMI-tools/aligned-deduplicated/*.bam")
agent_files = glob("./aligned-deduplicated/*.bam")
# Usage: julia -t 46 overlapping_reads_df.jl
# Run it in parallel to save time!
#####
println(umi_files, "\n", agent_files)
# extract_names(umi_files) |> println
# extract_names(agent_files) |> println
overlapping(umi_files, agent_files, project)