# Week2_Data_Manipulation/data_manipulation_demo.R at master · UW-RSeminar-Fall2015/Week2_Data_Manipulation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
## Working directory and its contents
getwd()  # current working directory; setwd() changes it
list.files()
list.files(all.files = TRUE)  # include hidden files as well

## Load the birds data (empty fields are read as NA)
birds <- read.csv(
  "birds.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  na.strings = ""
)
head(birds)
names(birds)

## Load the everett data
everett <- read.csv(
  "everett.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
head(everett)
names(everett)

## Size of a data frame: rows, columns, both at once
nrow(birds)
ncol(birds)
dim(birds)

## Data classes & coercion
str(birds)
class(birds)
class(birds$id)
birds$id <- as.character(birds$id)  # convert the id column to character ...
class(birds$id)
birds$id <- as.numeric(birds$id)    # ... and back to numeric
class(birds$id)
birds$species
f <- as.factor(birds$species)       # factor copy; birds$species itself is unchanged
f
levels(f)
unique(birds$species)


## Pull vectors/values out of a data frame by name or by index
birds$species
birds[, "species"]
birds[, 2]                      # second column; positional form is handy for automation
birds[1, ]                      # first row
birds[1, "duration"]            # duration value in row 1
birds[c(1, 3), "duration"]      # duration values in rows 1 and 3
birds[c(1, 3), "duration"] <- 4 # overwrite those two values
birds[c(1, 3), "duration"]

## A few more vector helpers
"NOWA" %in% birds$species       # single TRUE/FALSE: is "NOWA" present at all?
birds$species %in% "NOWA"       # one TRUE/FALSE per element; "NOWA" is a length-1 character vector
union(1:10, 5:15)

## Subsetting rows by a condition
a <- subset(birds, birds$species == "NOWA")
str(a)
which(birds$species == "NOWA")    # indices where the comparison is TRUE
which(birds$species != "NOWA")    # indices where species differs from "NOWA"
which(!birds$species == "NOWA")   # "!" negates any logical vector
!TRUE
a <- birds[which(birds$species == "NOWA"), ]  # same rows without subset()
str(a)

## Lists & list indexing
## A list can hold elements of different kinds: here three vectors,
## a 2x2 matrix filled by row, and a nested list.
a <- list(
  1:5,
  letters[1:5],
  month.name[1:5],
  matrix(data = 1:4, 2, 2, byrow = TRUE),
  list(1:3, 2:4, 3:5)
)
str(a)
a
a[[2]]            # second element (a character vector)
a[[2]][1]         # first value of that vector
a[[4]]            # the matrix
a[[4]][1, 2]      # row 1, column 2 of the matrix
a[[5]]            # the nested list
a[[5]][[2]]       # second element of the nested list
a[[5]][[2]][2]    # second value of that element

## For loop example: number of distinct species at each Everett point
head(everett)
p <- numeric(0)  # start from an empty numeric vector
p
for (i in 1:50) {
  a <- subset(everett, everett$point == i)       # rows recorded at point i
  p <- append(p, length(unique(a$species)))      # add that point's species count
}
p

## sapply example: one value per point
a <- split(everett, everett$point)  # list of data frames, one per point value
class(a)
head(a, 2)
a[[40]]
b <- sapply(a, function(x) length(x$species))  # number of rows in each piece
b
str(b)
unname(b)

## vapply equivalent (FUN.VALUE declares what each call must return)
b <- vapply(a, function(x) length(x$species), FUN.VALUE = numeric(1))
unname(b)

## lapply equivalent (always returns a list the same length as a)
b <- lapply(a, function(x) length(x$species))
str(b)
b
unname(unlist(b))

## apply - runs a function over the margins of a matrix or array
m <- matrix(1:81, 9)
m
apply(m, 1, sum)  # margin 1 = rows
apply(m, 2, sum)  # margin 2 = columns
rowSums(m)        # built-in shortcut (exists for sums and means only)
colMeans(m)
## Apply functions can be nested
apply(m, 2, function(x) {   # x: one column of m
  sapply(x, function(y) {   # y: one element of that column
    if (y %% 2 == 0) {
      y                     # even values are kept
    } else {
      y * 2                 # odd values are doubled
    }
  })
})

matrix(rep_len(c(2, 1), 81) * 1:81, 9)  # simpler equivalent

## mapply - vectorizes a function over several arguments at once
mapply(rep, 1:4, 4:1)              # arguments matched by position
mapply(rep, times = 1:4, x = 4:1)  # arguments matched by name

## sapply example: total count for each species
a <- split(everett, everett$species)  # one data frame per species
a[[1]]
b <- sapply(a, function(x) sum(x$count))
b
#write.csv(b,"species_totals.csv")

## More complex sapply example: three calculations per species
a <- split(everett, everett$species)
a[[1]]
b <- sapply(a, function(x) {
  c(
    "total" = sum(x$count),        # total number of individuals counted
    "npoints" = length(x$point),   # number of rows (points) for the species
    "meancars" = mean(x$cars)      # mean of cars over those rows
  )
})
str(b)   # a matrix
head(b)  # species are the columns, the 3 quantities are the rows
b <- as.data.frame(t(b))  # transpose, then convert to a data frame
str(b)
head(b)

####################### PLYR Package
#install.packages("plyr")
library(plyr)
## plyr package
##     written by Hadley Wickham, who also wrote ggplot2 and dplyr
## Takes a given data structure,
##       splits it into groups, applies some summary or function to each group of data,
##       combines back to a data structure
## transform vs summarize
##        transform adds the computation or function back to the original data
##        summarize creates a new data structure containing the summarized (group) results

## Most basic ddply: take a data frame, split it, apply a function, combine, and output a data frame
## general syntax: ddply(data.frame, variable(s), function, optional arguments)

## Going back to the for loop example:  Number of species per stop in Everett data
head(everett)
str(everett)
p <- numeric(0)  # Numeric vector of length zero (empty)
for (i in 1:50) {
  a <- subset(everett, everett$point == i)       # rows recorded at point i
  p <- append(p, length(unique(a$species)))      # distinct species at point i
}
num.spp.loop <- cbind(1:50, p)
colnames(num.spp.loop) <- c("point", "num_spp")
head(num.spp.loop)
## using ddply
##    count distinct species per point, as the loop above does;
##    plain length(species) would count rows and over-count a point
##    where a species is listed more than once
num.spp.plyr <- ddply(everett, .(point), summarize,
  num_spp = length(unique(species))
)
head(num.spp.plyr)

## sapply example revisited: total count of each species
a <- split(everett, everett$species)
a[[1]]
b <- sapply(a, function(x) sum(x$count))

str(b)
head(b)
## the same totals with plyr: one row per species
totl.spp.plyr <- ddply(
  everett, .(species), summarize,
  totl_spp_count = sum(count)
)
head(totl.spp.plyr)
## the more complex example: three statistics per species via sapply
a <- split(everett, everett$species)
a[[1]]
b <- sapply(a, function(x) {
  c(
    "total" = sum(x$count),
    "npoints" = length(x$point),
    "meancars" = mean(x$cars)
  )
})
str(b)
head(b)
b <- as.data.frame(t(b))
str(b)
head(b)
## using plyr
##    note: any number of summary expressions can be listed in one call
summary.spp.plyr <- ddply(
  everett, .(species), summarize,
  # total number of each species; sum(count, na.rm=TRUE) would skip NAs
  total = sum(count),
  # number of points where the species was encountered
  npoints = length(point),
  # mean number of cars; mean(cars, na.rm=TRUE) would skip NAs
  meancars = mean(cars)
)
head(summary.spp.plyr)

####################################################
## other structures and more complex data: the birds data frame
#options(width = 180)
names(birds)
head(birds)
## number of birds by species
length(unique(birds$species))  # how many different species
length(unique(birds$id))       # how many distinct bird ids
table(birds$species, useNA = "ifany")             # counts per species, NA shown if present
table(birds$species, birds$sex, useNA = "ifany")  # species by sex cross-tabulation

## Per species-and-sex group: group size, mean fat, variance of fat
ddply(
  birds, .(species, sex), summarize,
  num_birds = length(id),
  mean_fat = mean(fat),
  var_fat = var(fat)
)

## Per species: rounded summaries over all birds and over females only
bird.summry.dat <- ddply(
  birds, .(species), summarize,
  num_birds_all = length(id),
  mean_fat_all = round(mean(fat), 2),
  var_fat_all = round(var(fat), 4),
  num_birds_F = length(id[which(sex == "F")]),
  mean_fat_F = round(mean(fat[which(sex == "F")]), 2),
  var_fat_F = round(var(fat[which(sex == "F")]), 4)
)

## Females as a fraction of all birds of the species (a proportion, not x100)
bird.summry.dat$pct_Female <- with(bird.summry.dat, num_birds_F / num_birds_all)
bird.summry.dat

## transform example
##    add a new column to the existing data frame: pct_fat = fat/mass
dim(birds)
head(birds)
birds <- ddply(
  birds, .(species, sex), transform,
  pct_fat = round(fat / mass, 2)
)
dim(birds)
head(birds)
## re-read the file to get the original data back
birds <- read.csv(
  "birds.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  na.strings = ""
)
head(birds)
names(birds)
## or keep the result in a new data frame ~ it has as many records as the original
birds.plus <- ddply(
  birds, .(species, sex), transform,
  pct_fat = round(fat / mass, 2)
)
dim(birds.plus)
head(birds.plus)
tail(birds.plus)
birds.plus[c(1:3, 16:18, 31:33, 46:49, 64:66), ]

##  add a per-row variable (pct_fat) together with per-group values:
##  the number of birds and the mean fat within each species-and-sex group
head(birds)
ddply(
  birds, .(species, sex), summarize,
  num_birds = length(id),
  mean_fat = mean(fat),
  var_fat = var(fat)
)  ## the group summaries, for reference

birds.plus <- ddply(
  birds, .(species, sex), transform,
  pct_fat = round(fat / mass, 2),
  num_birds_spp_sex = length(id),           # group size, repeated on every row of the group
  mean_fat_spp_sex = round(mean(fat), 2)    # group mean, repeated on every row of the group
)
dim(birds.plus)
birds.plus[c(1:3, 16:18, 31:33, 46:49, 64:66), ]
head(birds.plus)
tail(birds.plus)

## Lastly, using other output structures
##    input a data frame; for each species compute the number of birds, the
##    number of females and of males, and the mean and variance of fat;
##    dlply returns the per-species results as a list
bird.list <- dlply(birds, .(species), summarize,
  num_birds = length(id),
  # count by sex: comparing species to 'F'/'M' never matches and always gives 0
  num_F = length(id[which(sex == "F")]),
  num_M = length(id[which(sex == "M")]),
  mean_fat = mean(fat),
  var_fat = var(fat)
)
str(bird.list)
bird.list