Skip to content

Commit 39c9ee0

Browse files
authored
read nexus tree blocks: with translate tables and networks (JuliaPhylo#197)
readNexusTrees changed to readnexus_treeblock. assumptions: format used by bacter or by SpeciesNetworks BEAST2 packages (which are different from each other)
1 parent 6b5828e commit 39c9ee0

8 files changed

+252
-60
lines changed

docs/src/lib/public.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ readTopology
8282
readTopologyLevel1
8383
readInputTrees
8484
readMultiTopology
85-
readNexusTrees
85+
readnexus_treeblock
8686
readSnaqNetwork
8787
readTrees2CF
8888
countquartetsintrees

examples/test_reticulatetreeblock.nex

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#NEXUS
2+
3+
begin taxa;
4+
dimensions ntax=2;
5+
taxlabels
6+
A
7+
D
8+
;
9+
End;
10+
11+
begin trees;
12+
13+
translate
14+
5 tax5, [to check robustness]
15+
1 tax1,
16+
2 tax2,
17+
3 tax3,
18+
4 tax4
19+
;
20+
21+
Tree bacter=((4,(3,#7[&conv=1, relSize=0.08, affectedBlocks="{bark,dirty,dry}"]:0.001)[&height_95%_HPD={38.01,87.51}]:0.3):0.6,(2,(1:0.1)#7:0.9):10);
22+
tree gt1 = ((4,(3,#7[&conv=7,relSize=0.58, posterior=0.87]:0.001::0.57):0.3):0.6,(2,(1)#7));
23+
tree fake = this line won't match a newick string
24+
tree gt2 = (error will be caught and warning sent;
25+
tree speciesnetwork = (((1:1.13,((2:0.21)#H1:0.89,(3:1.03,(#H1[&gamma=0.28]:0.30,4:0.51)S3:0.51)S4:0.08)S5:0.2):0.6,5:1.14):0.16);
26+
27+
end;
28+
29+
bacter example: https://raw.githubusercontent.com/taming-the-beast/Bacter-Tutorial/master/precooked_runs/bacter_tutorial.Ecoli_rplA.trees
30+
SpeciesNetwork examples:
31+
MCMC sample: https://raw.githubusercontent.com/mmatschiner/tutorials/master/bayesian_analysis_of_species_networks/res/speciesnetwork.trees
32+
summary: https://raw.githubusercontent.com/mmatschiner/tutorials/master/bayesian_analysis_of_species_networks/res/speciesnetwork_sum.trees

src/PhyloNetworks.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ module PhyloNetworks
6666
writeTableCF,
6767
mapAllelesCFtable,
6868
readInputTrees,
69-
readNexusTrees,
69+
readnexus_treeblock,
7070
summarizeDataCF,
7171
snaq!,
7272
readSnaqNetwork,

src/deprecated.jl

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
@deprecate phyloNetworklm phylolm
2-
@deprecate sigma2_estim sigma2_phylo
3-
@deprecate mu_estim mu_phylo
1+
@deprecate readNexusTrees readnexus_treeblock
2+
43
@deprecate getChild getchild false
54
@deprecate getChildren getchildren false
65
@deprecate getChildEdge getchildedge false

src/readData.jl

+3-47
Original file line numberDiff line numberDiff line change
@@ -963,7 +963,9 @@ function readTrees2CF(treefile::AbstractString; quartetfile="none"::AbstractStri
963963
writeTab=true::Bool, CFfile="none"::AbstractString,
964964
taxa::AbstractVector=Vector{String}(),
965965
writeQ=false::Bool, writeSummary=true::Bool, nexus=false::Bool)
966-
trees = (nexus ? readNexusTrees(treefile, readTopologyUpdate, false, false) : readInputTrees(treefile))
966+
trees = (nexus ?
967+
readnexus_treeblock(treefile, readTopologyUpdate, false, false; reticulate=false) :
968+
readInputTrees(treefile))
967969
if length(taxa)==0 # unionTaxa(trees) NOT default argument:
968970
taxa = unionTaxa(trees) # otherwise: tree file is read twice
969971
end
@@ -1275,49 +1277,3 @@ function createQuartet(taxa::Union{Vector{<:AbstractString},Vector{Int}}, qvec::
12751277
end
12761278
return Quartet(num,names,[1.0,0.0,0.0])
12771279
end
1278-
1279-
"""
1280-
readNexusTrees(filename::AbstractString, treereader=readTopology::Function [, args...])
1281-
1282-
Read trees in nexus-formatted file and return a vector of `HybridNetwork`s.
1283-
For the nexus format, see Maddison, Swofford & Maddison (1997)
1284-
https://doi.org/10.1093/sysbio/46.4.590.
1285-
The optional arguments are passed onto the individual tree reader.
1286-
1287-
Warnings:
1288-
- "translate" tables are not supported yet
1289-
- only the first tree block is read
1290-
"""
1291-
function readNexusTrees(file::AbstractString, treereader=readTopology::Function, args...)
1292-
vnet = HybridNetwork[]
1293-
rx_start = r"^\s*begin\s+trees\s*;"i
1294-
rx_end = r"^\s*end\s*;"i
1295-
rx_tree = r"^\s*tree\s+[^(]+(\([^;]*;)"i
1296-
# spaces,"Tree",spaces,any_symbols_other_than_(, then we capture:
1297-
# ( any_symbols_other_than_; ;
1298-
treeblock = false # whether we are currently reading the TREE block or not
1299-
open(file) do s
1300-
numl = 0
1301-
for line in eachline(s)
1302-
numl += 1
1303-
if treeblock # currently reading trees, check for END signal
1304-
occursin(rx_end, line) && break # break if end of tree block
1305-
else # not reading trees: check for the BEGIN signal
1306-
if occursin(rx_start, line) treeblock=true; end
1307-
continue # to next line, either way
1308-
end
1309-
# if we get there, it's that we are inside the treeblock (true) and no END signal yet
1310-
m = match(rx_tree, line)
1311-
m != nothing || continue # continue to next line if no match
1312-
phy = m.captures[1] # string
1313-
try
1314-
push!(vnet, treereader(phy, args...)) # readTopologyUpdate(phy,false)
1315-
catch err
1316-
print("skipped phylogeny on line $(numl) of file $file: ")
1317-
if :msg in fieldnames(typeof(err)) println(err.msg); else println(typeof(err)); end
1318-
end
1319-
end
1320-
end
1321-
return vnet # consistent output type: HybridNetwork vector. might be of length 0.
1322-
end
1323-

src/readwrite.jl

+201
Original file line numberDiff line numberDiff line change
@@ -1371,6 +1371,207 @@ function readMultiTopology(file::AbstractString, fast::Bool=true)
13711371
return vnet
13721372
end
13731373

1374+
1375+
@doc raw"""
1376+
readnexus_treeblock(filename, treereader=readTopology, args...;
1377+
reticulate=true, stringmodifier=[r"#(\d+)" => s"#H\1"])
1378+
1379+
Read the *first* "trees" block of a nexus-formatted file, using the translate
1380+
table if present, and return a vector of `HybridNetwork`s.
1381+
Information inside `[&...]` are interpreted as comments and are discarded by the
1382+
default tree reader. Optional arguments `args` are passed to the tree reader.
1383+
1384+
For the nexus format, see
1385+
[Maddison, Swofford & Maddison (1997)](https://doi.org/10.1093/sysbio/46.4.590).
1386+
1387+
Unless `reticulate` is false, the following is done to read networks with reticulations.
1388+
1389+
Prior to reading each phylogeny, each instance of `#number` is replaced by
1390+
`#Hnumber` to fit the standard extended Newick format at hybrid nodes.
1391+
This behavior can be changed with option `stringmodifier`, which should be a
1392+
vector of pairs accepted by `replace`.
1393+
1394+
Inheritance γ values are assumed to be given within "comment" blocks at *minor*
1395+
hybrid edges (cut as tips to form the extended Newick) like this for example,
1396+
as output by bacter ([Vaughan et al. 2017](http://dx.doi.org/10.1534/genetics.116.193425)):
1397+
1398+
#11[&conv=0, relSize=0.08, ...
1399+
1400+
or like this, as output by SpeciesNetwork
1401+
([Zhang et al. 2018](https://doi.org/10.1093/molbev/msx307)):
1402+
1403+
#H11[&gamma=0.08]
1404+
1405+
In this example, the corresponding edge to hybrid H11 has γ=0.08.
1406+
"""
1407+
function readnexus_treeblock(file::AbstractString, treereader=readTopology::Function, args...;
1408+
reticulate=true, stringmodifier=[r"#(\d+)\b" => s"#H\1"]) # add H
1409+
vnet = HybridNetwork[]
1410+
rx_start = r"^\s*begin\s+trees\s*;"i
1411+
rx_end = r"^\s*end\s*;"i
1412+
rx_tree = r"^\s*tree\s+[^(]+(\([^;]*;)"i
1413+
treeblock = false
1414+
translate = false
1415+
id2name = nothing
1416+
open(file) do s
1417+
numl = 0
1418+
for line in eachline(s)
1419+
numl += 1
1420+
if treeblock
1421+
occursin(rx_end, line) && break # exit if end of tree block
1422+
elseif occursin(rx_start, line) # start reading tree block
1423+
treeblock = true
1424+
line, translate, id2name = readnexus_translatetable(s)
1425+
else continue
1426+
end
1427+
# now we are inside the treeblock
1428+
m = match(rx_tree, line)
1429+
isnothing(m) && continue
1430+
phy = m.captures[1]
1431+
if reticulate # fix #Hn and extract γ from #Hn[&conv=n, relSize=γ]
1432+
phy = replace(phy, stringmodifier...)
1433+
id2gamma = readnexus_extractgamma(phy)
1434+
end
1435+
net = nothing
1436+
try
1437+
net = treereader(phy, args...)
1438+
catch err
1439+
warnmsg = "skipped phylogeny on line $(numl) of file\n$file\n" *
1440+
(:msg in fieldnames(typeof(err)) ? err.msg : string(typeof(err)))
1441+
@warn warnmsg
1442+
continue # don't push to vnet
1443+
end
1444+
reticulate && readnexus_assigngammas!(net, id2gamma)
1445+
if translate
1446+
for tip in net.leaf
1447+
id = parse(Int, tip.name)
1448+
tip.name = id2name[id]
1449+
end
1450+
end
1451+
push!(vnet, net)
1452+
end
1453+
end
1454+
return vnet
1455+
end
1456+
1457+
"""
1458+
readnexus_translatetable(io)
1459+
1460+
Read translate table from IO object `io`, whose first non-empty line should contain
1461+
"translate". Then each line should have "number name" and the end of the table
1462+
is indicated by a ;. Output tuple:
1463+
- line that was last read, and is not part of the translate table, taken from `io`
1464+
- translate: boolean, whether a table was successfully read
1465+
- id2name: dictionary mapping number to name.
1466+
"""
1467+
function readnexus_translatetable(io)
1468+
rx_translate = r"^\s*translate"i
1469+
rx_emptyline = r"^\s*$"
1470+
line = readline(io)
1471+
translate = false
1472+
id2name = Dict{Int,String}()
1473+
while true
1474+
if occursin(rx_translate, line)
1475+
translate = true
1476+
break
1477+
elseif occursin(rx_emptyline, line)
1478+
line = readline(io)
1479+
else
1480+
translate = false
1481+
break
1482+
end
1483+
end
1484+
if translate # then read the table
1485+
rx_end = r"^\s*;"
1486+
rx_idname = r"\s*(\d+)\s+(\w+)\s*([,;]?)"
1487+
while true
1488+
line = readline(io)
1489+
occursin(rx_emptyline, line) && continue
1490+
if occursin(rx_end, line)
1491+
line = readline(io)
1492+
break
1493+
end
1494+
m = match(rx_idname, line)
1495+
if isnothing(m)
1496+
@warn "problem reading the translate table at line $line.\nnumbers won't be translated to names"
1497+
translate = false
1498+
break
1499+
end
1500+
push!(id2name, parse(Int,m.captures[1]) => String(m.captures[2]))
1501+
if m.captures[3] == ";"
1502+
line = readline(io)
1503+
break
1504+
end
1505+
end
1506+
end
1507+
return line, translate, id2name
1508+
end
1509+
1510+
"""
1511+
readnexus_extractgamma(nexus_string)
1512+
1513+
Extract γ from comments and return a dictionary hybrid number ID => γ, from
1514+
one single phylogeny given as a string.
1515+
The output from BEAST2 uses this format for reticulations at *minor* edges,
1516+
as output by bacter ([Vaughan et al. 2017](http://dx.doi.org/10.1534/genetics.116.193425)):
1517+
1518+
#11[&conv=0, relSize=0.08, ...
1519+
1520+
or as output by SpeciesNetwork ([Zhang et al. 2018](https://doi.org/10.1093/molbev/msx307)):
1521+
1522+
#H11[&gamma=0.08]
1523+
1524+
The function below assumes that the "H" was already added back if not present
1525+
already (from bacter), like this:
1526+
1527+
#H11[&conv=0, relSize=0.19, ...
1528+
1529+
The bacter format is tried first. If this format doesn't give any match,
1530+
then the SpeciesNetwork format is tried next.
1531+
See [`readnexus_assigngammas!`](@ref).
1532+
"""
1533+
function readnexus_extractgamma(nexstring)
1534+
rx_gamma_v1 = r"#H(\d+)\[&conv=\d+,\s*relSize=(\d+\.\d+)"
1535+
rx_gamma_v2 = r"#H(\d+)\[&gamma=(\d+\.\d+)"
1536+
id2gamma = Dict{Int,Float64}()
1537+
# first: try format v1
1538+
for m in eachmatch(rx_gamma_v1, nexstring)
1539+
push!(id2gamma, parse(Int, m.captures[1]) => parse(Float64,m.captures[2]))
1540+
end
1541+
if isempty(id2gamma) # then try format v2
1542+
for m in eachmatch(rx_gamma_v2, nexstring)
1543+
push!(id2gamma, parse(Int, m.captures[1]) => parse(Float64,m.captures[2]))
1544+
end
1545+
end
1546+
return id2gamma
1547+
end
1548+
1549+
"""
1550+
readnexus_assigngammas!(net, d::Dict)
1551+
1552+
Assign d[i] as the `.gamma` value of the minor parent edge of hybrid "Hi",
1553+
if this hybrid node name is found, and if its minor parent doesn't already
1554+
have a non-missing γ. See [`readnexus_extractgamma`](@ref)
1555+
"""
1556+
function readnexus_assigngammas!(net::HybridNetwork, id2gamma::Dict)
1557+
for (i,gam) in id2gamma
1558+
nam = "H$i"
1559+
j = findfirst(n -> n.name == nam, net.hybrid)
1560+
if isnothing(j)
1561+
@warn "didn't find any hybrid node named $nam."
1562+
continue
1563+
end
1564+
hn = net.hybrid[j]
1565+
he = getparentedgeminor(hn)
1566+
if he.gamma == -1.0
1567+
setGamma!(he, gam)
1568+
else
1569+
@warn "hybrid edge number $(he.number) has γ=$(he.gamma). won't erase with $gam."
1570+
end
1571+
end
1572+
return net
1573+
end
1574+
13741575
"""
13751576
writeMultiTopology(nets, file_name; append=false)
13761577
writeMultiTopology(nets, IO)

test/test_lm.jl

+3-3
Original file line numberDiff line numberDiff line change
@@ -387,9 +387,9 @@ phynetlm = phylolm(@formula(trait ~ pred), dfr, net; reml=false)
387387
@test bic(phynetlm) bic(fit_mat)
388388

389389
# Deprecated methods
390-
@test phynetlm.model === phynetlm
391-
@test phynetlm.mf.f == formula(phynetlm)
392-
@test phynetlm.mm.m == modelmatrix(phynetlm)
390+
@test (@test_logs (:warn,r"^accessing") phynetlm.model) === phynetlm
391+
@test (@test_logs (:warn,r"^accessing") phynetlm.mf.f) == formula(phynetlm)
392+
@test (@test_logs (:warn,r"^accessing") phynetlm.mm.m) == modelmatrix(phynetlm)
393393

394394
# unordered data
395395
dfr = dfr[[2,6,10,5,12,7,4,11,1,8,3,9], :]

test/test_readInputData.jl

+9-5
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,17 @@
2727

2828

2929
@testset "test: reading nexus file" begin
30+
# with translate table, hybrids, failed γ in net 2, bad net 3, v2 format in net 4
31+
nexusfile = joinpath(@__DIR__, "..", "examples", "test_reticulatetreeblock.nex")
32+
# nexusfile = joinpath(dirname(pathof(PhyloNetworks)), "..","examples","test_reticulatetreeblock.nex")
33+
vnet = (@test_logs (:warn, r"^hybrid edge") (:warn,r"^skipped") readnexus_treeblock(nexusfile));
34+
@test length(vnet) == 3
35+
@test writeTopology(vnet[1]) == "((tax4,(tax3,#H7:0.001::0.08):0.3):0.6,(tax2,(tax1:0.1)#H7:0.9::0.92):10.0);"
36+
@test writeTopology(vnet[3]) == "((tax1:1.13,((tax2:0.21)#H1:0.89::0.72,(tax3:1.03,(#H1:0.3::0.28,tax4:0.51)S3:0.51)S4:0.08)S5:0.2):0.6,tax5:1.14);"
37+
# example without translate table and without reticulations
3038
nexusfile = joinpath(@__DIR__, "..", "examples", "test.nex")
3139
# nexusfile = joinpath(dirname(pathof(PhyloNetworks)), "..","examples","test.nex")
32-
vnet = readNexusTrees(nexusfile);
33-
@test length(vnet) == 10
34-
@test length(vnet[10].edge) == 10
35-
@test vnet[10].edge[7].length 0.00035
36-
vnet = readNexusTrees(nexusfile, PhyloNetworks.readTopologyUpdate, false, false);
40+
vnet = readnexus_treeblock(nexusfile, PhyloNetworks.readTopologyUpdate, false, false; reticulate=false);
3741
@test length(vnet) == 10
3842
@test length(vnet[10].edge) == 9
3943
@test vnet[10].edge[7].length 0.00035

0 commit comments

Comments
 (0)