@@ -33,8 +33,8 @@ in the example below, this file is best read later with the option
33
33
```julia
34
34
mapAllelesCFtable("allele-species-map.csv", "allele-quartet-CF.csv";
35
35
filename = "quartetCF_speciesNames.csv")
36
- df_sp = DataFrame( CSV.File ("quartetCF_speciesNames.csv"); copycols=false ); # DataFrame object
37
- dataCF_specieslevel = readTableCF!(df_sp); # DataCF object
36
+ df_sp = CSV.read ("quartetCF_speciesNames.csv", DataFrame ); # DataFrame object
37
+ dataCF_specieslevel = readTableCF!(df_sp, mergerows=true ); # DataCF object
38
38
```
39
39
"""
40
40
function mapAllelesCFtable (alleleDF:: AbstractString , cfDF:: AbstractString ;
@@ -65,12 +65,12 @@ as its first argument.
65
65
function mapAllelesCFtable! (cfDF:: DataFrame , alleleDF:: DataFrame , co:: Vector{Int} ,write:: Bool ,filename:: AbstractString )
66
66
size (cfDF,2 ) >= 7 || error (" CF DataFrame should have 7+ columns: 4taxa, 3CF, and possibly ngenes" )
67
67
if length (co)== 0 co= [1 ,2 ,3 ,4 ]; end
68
- compareTaxaNames (alleleDF,cfDF,co)
68
+ allelecol, speciescol = compareTaxaNames (alleleDF,cfDF,co)
69
69
for j in 1 : 4
70
70
for ia in 1 : size (alleleDF,1 ) # for all alleles
71
71
cfDF[! ,co[j]] = map (x-> replace (string (x),
72
- Regex (" ^$(string (alleleDF[ia,:allele ])) \$ " ) =>
73
- alleleDF[ia,:species ]),
72
+ Regex (" ^$(string (alleleDF[ia,allelecol ])) \$ " ) =>
73
+ alleleDF[ia,speciescol ]),
74
74
cfDF[! ,co[j]])
75
75
end
76
76
end
85
85
# inside readTableCF!
86
86
# by deleting rows that are not informative like sp1 sp1 sp1 sp2
87
87
# keepOne=true: we only keep one allele per species
88
"""
    cleanAlleleDF!(newdf::DataFrame, cols::Vector{<:Integer}; keepOne::Bool=false)

Delete from `newdf` the rows (4-taxon sets) that are uninformative about
between-species relationships, and rename repeated taxon names in the rows
that are kept. The 4 taxon names of a row are read from columns `cols[1:4]`.

- Rows with 4 distinct taxon names are left untouched.
- If `keepOne` is false: a row is kept if at least one name appears exactly
  twice (like "sp1 sp1 sp2 sp3" or "sp1 sp1 sp2 sp2"). For each such name, its
  second occurrence in the row is replaced by the name followed by
  `repeatAlleleSuffix`. Rows like "sp1 sp1 sp1 sp2" or "sp1 sp1 sp1 sp1" are
  deleted.
- If `keepOne` is true: all rows with a repeated taxon name are deleted.

If the first taxon name of the first row is an integer, the 4 taxon columns
are first converted to strings, so that the suffix can be appended.

A message is printed if any row is deleted, and a warning is issued if no row
is left. `newdf` is modified in place.

Output: vector of the taxon names (as strings) that were found repeated
exactly twice in some row that was kept, without duplicates and in no
particular order.
"""
function cleanAlleleDF!(newdf::DataFrame, cols::Vector{<:Integer}; keepOne::Bool=false)
    delrows = Int[] # indices of rows to delete
    repSpecies = Set{String}() # names of species with repeated alleles
    # taxon names as integers: convert to strings, to be able to add the suffix.
    # check for at least 1 row first: newdf[1,...] would error on an empty table
    if nrow(newdf) > 0 && isa(newdf[1,cols[1]], Integer)
        for j in 1:4
            # df[!,col] = ... replaces the column, so its element type can change
            newdf[!,cols[j]] = map(string, newdf[!,cols[j]])
        end
    end
    row = Vector{String}(undef, 4) # re-used for each row
    for i in 1:nrow(newdf)
        map!(j -> newdf[i,cols[j]], row, 1:4)
        uniq = unique(row)
        length(uniq) == 4 && continue # 4 distinct taxa: informative, keep as is
        # by now, at least 1 species is repeated
        # default: do not keep. must be assigned outside of the 'if' below,
        # otherwise 'keep' would be undefined when keepOne is true
        keep = false
        if !keepOne # then we may choose to keep this row
            # 3 options: sp1 sp1 sp2 sp3; or sp1 sp1 sp2 sp2 (keep)
            # or sp1 sp1 sp1 sp2; or sp1 sp1 sp1 sp1 (do not keep)
            for u in uniq
                ind = row .== u # indices of taxon names matching u
                if sum(ind) == 2
                    keep = true
                    push!(repSpecies, string(u))
                    # change the second instance of a repeated taxon name with suffix
                    k = findlast(ind)
                    newdf[i,cols[k]] = string(u, repeatAlleleSuffix)
                end
            end
        end
        keep || push!(delrows, i)
    end
    nrows = nrow(newdf)
    nkeep = nrows - length(delrows)
    if nkeep < nrows
        print("""found $(length(delrows)) 4-taxon sets uninformative about between-species relationships, out of $(nrows).
              These 4-taxon sets will be deleted from the data frame. $nkeep informative 4-taxon sets will be used.
              """)
        nkeep > 0 || @warn "All 4-taxon subsets are uninformative, so the dataframe will be left empty"
        deleteat!(newdf, delrows) # deleteat! requires DataFrames 1.3
    end
    return collect(repSpecies)
end
181
133
182
134
240
192
# function to compare the taxon names in the allele-species matching table
241
193
# and the CF table
242
194
function compareTaxaNames (alleleDF:: DataFrame , cfDF:: DataFrame , co:: Vector{Int} )
243
- checkMapDF (alleleDF)
244
- # println("found $(length(alleleDF[1])) allele-species matches")
195
+ allelecol, speciescol = checkMapDF (alleleDF)
245
196
CFtaxa = string .(mapreduce (x -> unique (skipmissing (x)), union, eachcol (cfDF[! ,co[1 : 4 ]])))
246
- alleleTaxa = map (string, alleleDF[! ,:allele ]) # as string, too
197
+ alleleTaxa = map (string, alleleDF[! ,allelecol ]) # as string, too
247
198
sizeCF = length (CFtaxa)
248
199
sizeAllele = length (alleleTaxa)
249
200
if sizeAllele > sizeCF
@@ -260,14 +211,26 @@ function compareTaxaNames(alleleDF::DataFrame, cfDF::DataFrame, co::Vector{Int})
260
211
for n in unchanged warnmsg *= " $n " ; end
261
212
@warn warnmsg
262
213
end
263
- return nothing
214
+ return allelecol, speciescol
264
215
end
265
216
266
"""
    checkMapDF(mapping_allele2species::DataFrame)

Check that the data frame has at least 2 columns, with one column named
"allele" (or "individual", used if there is no column named "allele")
and one column named "species". Throw an error otherwise.

Output: tuple of the indices of these 2 columns: first the allele (or
individual) column, then the species column.
"""
function checkMapDF(alleleDF::DataFrame)
    if size(alleleDF, 2) < 2
        error("Allele-Species matching Dataframe should have at least 2 columns")
    end
    columnnames = DataFrames.propertynames(alleleDF)
    # "allele" has priority; "individual" is only a fallback
    allelecol = findfirst(isequal(:allele), columnnames)
    if allelecol === nothing
        allelecol = findfirst(isequal(:individual), columnnames)
    end
    if allelecol === nothing
        error("In allele mapping file there is no column named 'allele' or 'individual'")
    end
    speciescol = findfirst(isequal(:species), columnnames)
    if speciescol === nothing
        error("In allele mapping file there is no column named species")
    end
    return (allelecol, speciescol)
end
272
235
273
236
0 commit comments