The data this week comes from San Francisco's open data portal.
There are dozens of tree species, and many other interesting features to explore in this dataset! I did drop a few columns that were either > 75% missing or redundant; feel free to check out the source for the fully original dataset.
Also - make sure to follow @tidypod - they'll have some interesting #TidyTuesday
updates to come this week!
Some interesting articles:
# Get the Data
# Option 1: read the CSV straight from the TidyTuesday GitHub repository.
sf_trees <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-01-28/sf_trees.csv"
)

# Option 2: load through the tidytuesdayR package
# (https://github.com/dslc-io/tidytuesdayR).
# PLEASE NOTE: to use 2020 data you need to update tidytuesdayR from GitHub,
# e.g. via pak::pak("dslc-io/tidytuesdayR").
# The week can be given either as an ISO-8601 date or as year + week number.
# Both calls below assign to `tuesdata`, so the second replaces the first.
tuesdata <- tidytuesdayR::tt_load("2020-01-28")
tuesdata <- tidytuesdayR::tt_load(2020, week = 5)

# Pull the trees table out of the loaded object (overwrites Option 1's result).
sf_trees <- tuesdata$sf_trees
A full data dictionary is available at the source, but it's fairly sparse.
variable | class | description |
---|---|---|
tree_id | double | Unique ID |
legal_status | character | Legal status: Permitted or DPW maintained |
species | character | Tree species; includes the common name after the `::` separator |
address | character | Street Address |
site_order | double | Order of tree at address where multiple trees are at same address. Trees are ordered in ascending address order |
site_info | character | Site Info - Where the tree resides |
caretaker | character | Agency or person that is primary caregiver to tree -- Owner of Tree |
date | double | Date Planted (NA if before 1955) |
dbh | double | Diameter at breast height |
plot_size | character | Dimension of plot - typically in feet |
latitude | double | Latitude |
longitude | double | Longitude |
library(tidyverse)
library(here)
library(tidytuesdaymeta)
library(pryr)
library(visdat)
library(skimr)
library(lubridate)
library(leaflet)
# Cleaning script: raw Street_Tree_Map.csv -> sf_trees.csv

# Helper from tidytuesdaymeta; presumably sets up this week's folder
# (its behavior is not visible here -- confirm against the package).
create_tidytuesday_folder()

# Read the raw export with an explicit type for every column, then normalise
# the column names with janitor::clean_names() (e.g. qLegalStatus becomes
# q_legal_status, which is the form the steps below rely on).
raw_df <- janitor::clean_names(
  read_csv(
    here::here("2020", "2020-01-28", "Street_Tree_Map.csv"),
    col_types = cols(
      TreeID         = col_double(),
      qLegalStatus   = col_character(),
      qSpecies       = col_character(),
      qAddress       = col_character(),
      SiteOrder      = col_double(),
      qSiteInfo      = col_character(),
      PlantType      = col_character(),
      qCaretaker     = col_character(),
      qCareAssistant = col_character(),
      PlantDate      = col_character(),
      DBH            = col_double(),
      PlotSize       = col_character(),
      PermitNotes    = col_character(),
      XCoord         = col_double(),
      YCoord         = col_double(),
      Latitude       = col_double(),
      Longitude      = col_double(),
      Location       = col_character()
    )
  )
)

small_df <- raw_df %>%
  # Remove landscaping entries. Rows whose plant_type is NA are removed as
  # well, because filter() only keeps rows where the comparison is TRUE.
  filter(plant_type != "Landscaping") %>%
  # Split plant_date on spaces into a date piece and a time piece; only the
  # date piece is kept, parsed as month/day/year.
  separate(plant_date, into = c("date", "time"), sep = " ") %>%
  mutate(date = parse_date(date, format = "%m/%d/%Y")) %>%
  # Sort by planting date.
  arrange(date) %>%
  # Drop every column that is not part of the published dataset in one go;
  # the remaining columns keep their original relative order.
  select(
    -c(
      x_coord, y_coord, q_care_assistant, permit_notes,
      plant_type, time, location
    )
  ) %>%
  # Strip the "q_" prefix from the remaining prefixed columns.
  rename(
    legal_status = q_legal_status,
    species      = q_species,
    address      = q_address,
    site_info    = q_site_info,
    caretaker    = q_caretaker
  )

# Quick summary of the cleaned data.
skimr::skim(small_df)

# Save the cleaned dataset next to the raw file.
write_csv(small_df, here::here("2020", "2020-01-28", "sf_trees.csv"))