Interview Areas.Rmd

---
title: "Interview Areas"
author: "Malcolm Morgan"
date: "3 July 2019"
output: github_document
always_allow_html: yes
---

```{r setup, include=FALSE}
# Default chunk option: echo the source code of chunks unless a chunk sets echo itself
knitr::opts_chunk$set(echo = TRUE)
```

## Setup

```{r, include=FALSE}
# Root folder of the access-restricted data. The default is the original
# machine-specific location; set the CREDS_SECURE_PATH environment variable to
# point somewhere else without editing this file.
secure_path <- Sys.getenv(
  "CREDS_SECURE_PATH",
  unset = "E:/OneDrive - University of Leeds/Data/CREDS Data/Tim Share"
)

```

## Input data

First load the data. Some datasets cover England and Wales only, so we will start with just those areas.

```{r, include=FALSE}
library(sf)
library(dplyr)
library(Hmisc)
library(tmap)
# Interactive (leaflet) maps rather than static plots
tmap_mode("view")

# LSOA boundaries and the prepared per-LSOA tables
bounds <- st_read("data-prepared/LSOA_generalised.gpkg")
elec <- read.csv("data-prepared/Electricty_2010-17.csv", stringsAsFactors = FALSE)
# Keep the first record for any LSOA that appears more than once
elec <- elec[!duplicated(elec$LSOA11), ]
gas <- read.csv("data-prepared/Gas_2010-17.csv", stringsAsFactors = FALSE)
mot <- read.csv(
  paste0(secure_path, "/From Tim/MOT Data RACv9.3/MOT Data RACv9.3 LSOAoutputs_2011.csv"),
  stringsAsFactors = FALSE
)
age <- readRDS("data-prepared/age.Rds") # Not full UK
census <- readRDS("data-prepared/census_lsoa.Rds") # Not full UK
heating <- readRDS("data-prepared/central_heating.Rds") # Not full UK
EPC <- readRDS("data-prepared/EPC.Rds")
population <- readRDS("data-prepared/population.Rds") # Not full UK
density <- readRDS("data-prepared/populationDensity.Rds")
rooms <- readRDS("data-prepared/rooms.Rds")
ru <- readRDS("data-prepared/ruralurban.Rds")
access_town <- readRDS("data-prepared/access_town.Rds")
access_town <- access_town[, c("LSOA11", "acc_town_p15min_PT_walk")]
clasf <- readRDS("data-prepared/area_classification.Rds")

# Unpack the zipped secure files into a private scratch directory. A unique
# tempfile() path is used so the clean-up below can never delete an unrelated
# "temp" folder that already exists in the working directory.
extract_dir <- tempfile("secure_unzip_")
dir.create(extract_dir)

unzip("../Secure-Data/Excess/XSExpData1.zip", exdir = extract_dir)
tim <- read.csv(file.path(extract_dir, "XSExpData1.csv"), stringsAsFactors = FALSE)

unzip("../Secure-Data/Excess/Experian.zip", exdir = extract_dir)
experian <- read.csv(
  file.path(extract_dir, "UKDA-5738-csv/csv/2011-experian-data.csv"),
  stringsAsFactors = FALSE
)

unlink(extract_dir, recursive = TRUE)

# The real column headers sit in the 4th data row; the data start on the 5th.
# Flatten that row to a plain character vector before using it as names.
names(experian) <- as.character(unlist(experian[4, ], use.names = FALSE))
experian <- experian[5:nrow(experian), ]
experian <- experian[, c("GeographyValue", "Median_(H) Household Income Value")]
names(experian) <- c("LSOA", "median_household_income")

```

We will subset the data and join it together.

```{r, include=FALSE}
# Keep only the boundaries that also appear in the census table; every other
# table is then cut down to those LSOAs (and to the columns used later).
bounds <- bounds[bounds$LSOA11 %in% census$CODE, ]

# Energy consumption: meters, mean and total use for 2011 and 2017
elec <- elec[
  elec$LSOA11 %in% bounds$LSOA11,
  c(
    "LSOA11", "DomMet_11", "DomMet_17",
    "MeanDomElec_11_kWh", "MeanDomElec_17_kWh",
    "TotDomElec_11_kWh", "TotDomElec_17_kWh"
  )
]
gas <- gas[
  gas$LSOA11 %in% bounds$LSOA11,
  c(
    "LSOA11", "GasMet_11", "GasMet_17",
    "MeanDomGas_11_kWh", "MeanDomGas_17_kWh",
    "TotDomGas_11_kWh", "TotDomGas_17_kWh"
  )
]

# Vehicle data: filter on the original LSOA column, then apply short names
mot <- setNames(
  mot[mot$LSOA %in% bounds$LSOA11, ],
  c(
    "LSOA", "cars_total", "cars_miles", "pu5k", "p5_12k", "po12k", "age_av",
    "miles_av_u3", "miles_av_o13", "pcars_diesel", "pmiles_diesel",
    "vans_total", "vans_miles", "pmiles_car", "pmiles_vans", "cars_percap",
    "miles_percap"
  )
)

# Building-period counts per LSOA
age <- age[
  age$lsoa %in% bounds$LSOA11,
  c(
    "lsoa", "BP_PRE_1900", "BP_1900_1918", "BP_1919_1929", "BP_1930_1939",
    "BP_1945_1954", "BP_1955_1964", "BP_1965_1972", "BP_1973_1982",
    "BP_1983_1992", "BP_1993_1999", "BP_2000_2009", "BP_2010_2015",
    "BP_UNKNOWN", "ALL_PROPERTIES"
  )
]

census <- census[census$CODE %in% bounds$LSOA11, ]
heating <- heating[heating$LSOA11 %in% bounds$LSOA11, ]
EPC <- EPC[EPC$LSOA11 %in% bounds$LSOA11, c("LSOA11", "Dwellings", "Crr_EE", "Ptn_EE")]

# Population counts and density are coerced to numeric
population <- population[population$LSOA11 %in% bounds$LSOA11, ]
population[c("pop2011", "pop2016")] <- lapply(
  population[c("pop2011", "pop2016")],
  as.numeric
)
density <- density[density$LSOA11 %in% bounds$LSOA11, ]
density$dense_2017 <- as.numeric(density$dense_2017)


# Some values are more meaningful as percentages or averages

# Representative construction year for each building-period count column
# (roughly the mid-point of the band; 1850 stands in for the open-ended
# pre-1900 band). BP_UNKNOWN is deliberately left out of the average.
band_years <- c(
  BP_PRE_1900 = 1850,
  BP_1900_1918 = 1909,
  BP_1919_1929 = 1924,
  BP_1930_1939 = 1934.5,
  BP_1945_1954 = 1949.5,
  BP_1955_1964 = 1959.5,
  BP_1965_1972 = 1968.5,
  BP_1973_1982 = 1977.5,
  BP_1983_1992 = 1987.5,
  BP_1993_1999 = 1996,
  BP_2000_2009 = 2004.5,
  BP_2010_2015 = 2012.5
)

# Property counts as a numeric matrix (one row per LSOA, one column per band),
# selected by name so the result does not depend on column positions.
band_counts <- do.call(cbind, lapply(age[, names(band_years)], as.numeric))

# Count-weighted mean construction year per LSOA. seq_len() and vapply() keep
# this well-defined (an empty numeric vector) when there are no rows.
age$mean_house_age <- vapply(
  seq_len(nrow(age)),
  function(i) weighted.mean(x = unname(band_years), w = band_counts[i, ]),
  numeric(1)
)

# Share of all properties in each band, as a percentage rounded to 2 d.p.
# Names are the output columns; values are the source count columns.
pct_cols <- c(
  pP1900 = "BP_PRE_1900",
  p1900_18 = "BP_1900_1918",
  p1919_29 = "BP_1919_1929",
  p1930_39 = "BP_1930_1939",
  p1945_54 = "BP_1945_1954",
  p1955_64 = "BP_1955_1964",
  p1965_72 = "BP_1965_1972",
  p1973_82 = "BP_1973_1982",
  p1983_92 = "BP_1983_1992",
  p1993_99 = "BP_1993_1999",
  p2000_09 = "BP_2000_2009",
  p2010_15 = "BP_2010_2015",
  pUNKNOWN = "BP_UNKNOWN"
)
for (pct_name in names(pct_cols)) {
  age[[pct_name]] <- round(age[[pct_cols[[pct_name]]]] / age$ALL_PROPERTIES * 100, 2)
}

# Drop the raw counts; keep the identifier and the derived measures
age <- age[, c("lsoa", "mean_house_age", names(pct_cols))]

# Convert central-heating counts into percentages of all households (2 d.p.)
# and keep only the identifier plus the derived shares.
heating <- local({
  tbl <- heating
  as_pct <- function(count) round(count / tbl$All * 100, 2)

  tbl$pHeating_None <- as_pct(tbl$`No CH`)
  tbl$pHeating_Gas <- as_pct(tbl$Gas)
  tbl$pHeating_Electric <- as_pct(tbl$Electric)
  # Oil, solid fuel and "other" are lumped into a single category
  tbl$pHeating_Other <- as_pct(tbl$Oil + tbl$`Solid fuel` + tbl$Other)

  tbl[, c("LSOA11", "pHeating_None", "pHeating_Gas", "pHeating_Electric", "pHeating_Other")]
})

# Build "<code> <name>" labels for the group and supergroup of each area,
# stored as factors, then keep only the area code and the two labels.
clasf$area_group <- as.factor(
  paste(clasf$`Group Code`, clasf$`Group Name`, sep = " ")
)
clasf$area_supergroup <- as.factor(
  paste(clasf$`Supergroup Code`, clasf$`Supergroup Name`, sep = " ")
)
clasf <- clasf[, c("SOA Code", "area_group", "area_supergroup")]


# Join together: start from electricity and attach every other table by LSOA
# code, keeping all electricity rows (left joins).
all <- elec %>%
  left_join(gas, by = "LSOA11") %>%
  left_join(mot, by = c("LSOA11" = "LSOA")) %>%
  left_join(age, by = c("LSOA11" = "lsoa")) %>%
  left_join(census, by = c("LSOA11" = "CODE")) %>%
  left_join(heating, by = "LSOA11") %>%
  left_join(EPC, by = "LSOA11") %>%
  left_join(population, by = "LSOA11") %>%
  left_join(density, by = "LSOA11") %>%
  left_join(experian, by = c("LSOA11" = "LSOA")) %>%
  left_join(rooms, by = c("LSOA11" = "geography.code")) %>%
  left_join(ru, by = "LSOA11") %>%
  left_join(access_town, by = "LSOA11") %>%
  left_join(clasf, by = c("LSOA11" = "SOA Code"))

# The individual tables are no longer needed once merged
rm(
  age, census, density, elec, EPC, gas, heating, mot, population, experian,
  rooms, ru, access_town
)
```

## Step 1: Top 30% of Gas, Electricity, or Driving

```{r, include=FALSE}
# Decile break points (0%, 10%, ..., 100%) of each measure, ignoring missing values
gas_decice <- quantile(
  all[["MeanDomGas_17_kWh"]],
  probs = seq(0, 1, 0.1),
  na.rm = TRUE
)
elec_decice <- quantile(
  all[["MeanDomElec_17_kWh"]],
  probs = seq(0, 1, 0.1),
  na.rm = TRUE
)
drive_decice <- quantile(
  all[["miles_percap"]],
  probs = seq(0, 1, 0.1),
  na.rm = TRUE
)
epc_decile <- quantile(
  all[["Crr_EE"]],
  probs = seq(0, 1, 0.1),
  na.rm = TRUE
)
```

### Gas
Deciles for mean gas consumption per capita (kWh)

```{r, echo=FALSE}
# Show the gas decile break points
print(gas_decice)
```

### Electric
Deciles for mean electricity consumption per capita (kWh)
```{r, echo=FALSE}
# Show the electricity decile break points
print(elec_decice)
```

### Driving
Decile for miles driven in cars per capita
```{r, echo=FALSE}
# Show the driving decile break points
print(drive_decice)
```

Now filter areas that are in the top 30% for at least one category

```{r, echo=FALSE}
# Rank every LSOA into 100 buckets per measure (1 = lowest values, 100 = highest)
all$gas_centile <- ntile(all$MeanDomGas_17_kWh, 100)
all$elec_centile <- ntile(all$MeanDomElec_17_kWh, 100)
all$drive_centile <- ntile(all$miles_percap, 100)

# Keep LSOAs whose centile is above 69 (buckets 70-100) for gas AND electricity
# AND driving.
# NOTE(review): the accompanying text talks about "at least one category", but
# this mask requires all three - confirm which is intended.
in_top <- all$elec_centile > 69 & all$gas_centile > 69 & all$drive_centile > 69

# which() discards NA as well as FALSE, so an LSOA with a missing centile for
# any measure (including driving) is dropped instead of coming back as a row of
# NAs, which is what subsetting with an NA in a logical mask produces.
all_sub <- all[
  which(in_top),
  c(
    "LSOA11", "area_supergroup", "area_group",
    "gas_centile", "elec_centile", "drive_centile",
    "median_household_income", "mean_household.size",
    "RUC11", "acc_town_p15min_PT_walk", "Crr_EE"
  )
]
```

Just under 50% of LSOAs are in the top 30% for at least one variable.

## Step 2: Rural Urban
Rural Urban Classifications:

```{r, echo=FALSE}
# Summarise the rural/urban classification across all LSOAs
print(summary(all[["RUC11"]]))
```


```{r, echo=FALSE}
# Keep only LSOAs whose rural/urban class is one of the four urban categories
all_sub <- all_sub[
  is.element(
    all_sub$RUC11,
    c(
      "Urban city and town",
      "Urban city and town in a sparse setting",
      "Urban major conurbation",
      "Urban minor conurbation"
    )
  ),
]
```
Filter out rural areas, now down to 33% of LSOAs

## Step 3: Accessibility Indicators

Using the DFT accessibility statistics to filter out areas where less than 50% of people are within 15 minutes of the town centre by walking or public transport.

```{r, echo=FALSE}
# Keep LSOAs with a known accessibility value above 50; which() drops both the
# missing values and the rows that fail the threshold in a single step.
all_sub <- all_sub[which(all_sub$acc_town_p15min_PT_walk > 50), ]
```

Now down to 20 % of LSOAs

## Step 4: Building Energy Efficiency

Filter out areas with below average EPC ratings. The current EPC deciles are:

```{r, echo=FALSE}
# Show the EPC decile break points
print(epc_decile)
```

```{r, echo=FALSE}
# Keep LSOAs whose current EPC score is above the median (6th break point = 50%).
# which() drops rows with a missing score; a bare logical mask containing NA
# would instead add all-NA rows that end up in the exported CSV.
all_sub <- all_sub[which(all_sub$Crr_EE > epc_decile[6]), ]

# Sort: highest driving first, then gas, then electricity
all_sub <- arrange(
  all_sub,
  desc(drive_centile),
  desc(gas_centile),
  desc(elec_centile)
)

# Attach the boundary geometry and turn the table into a spatial object
all_sub <- left_join(all_sub, bounds, by = c("LSOA11"))
all_sub <- st_as_sf(all_sub)
st_crs(all_sub) <- 27700

# List the postcodes that fall inside each selected LSOA
postcodes <- readRDS("../../ITSleeds/OpenPostcodes/data-output/code_point_open.Rds")
postcodes_sub <- postcodes[all_sub, ] # spatial subset: postcodes intersecting any area
postcodes_inter <- st_intersects(all_sub, postcodes_sub)
postcodes_sub <- st_drop_geometry(postcodes_sub)
# One comma-separated string per LSOA; vapply() guarantees a character vector
# with exactly one element per area, even when no areas remain.
postcodes_match <- vapply(
  postcodes_inter,
  function(idx) paste(postcodes_sub$postcode[idx], collapse = ", "),
  character(1)
)
all_sub$postcodes <- postcodes_match

write.csv(
  st_drop_geometry(all_sub),
  "data/interview_areas_top_30p_postcodes.csv",
  row.names = FALSE
)

```

Down to 7% of LSOAs, so let's look at where we are.

## Summary of areas.
A quick map, we have quite a good geographic spread.
See an interactive map at  https://www.wisemover.co.uk/creds/
```{r, echo=FALSE}
# Postcode points that fall inside the selected areas (spatial subset)
postcodes_sub <- postcodes[all_sub, ]

# Build the two map layers separately, then overlay them: areas shaded by
# driving centile with the postcode points drawn on top.
area_layer <- tm_shape(all_sub) + tm_polygons(col = "drive_centile")
postcode_layer <- tm_shape(postcodes_sub) + tm_dots()
area_layer + postcode_layer


```

## Narrow Down

7% of LSOAs is still a lot, so rebuild with tighter criteria by requiring areas to have higher energy usage. E.g. requiring areas to be in the top 20% of energy use for one of the three categories drops to 5% of LSOAs; top 10% drops to 2% of LSOAs.

Or we could look for areas that are high in all categories. Only 556 LSOAs are in the top 30% for all 3 energy types.

```{r, echo=FALSE}
# Variables shown in the map popups and written to the CSV for each area
tight_vars <- c(
  "MeanDomElec_11_kWh",
  "MeanDomGas_11_kWh",
  "cars_percap",
  "miles_percap",
  "T2W_Car.",
  "pHeating_Gas",
  "pHeating_Electric",
  "pHeating_Other",
  "median_household_income",
  "mean_household.size",
  "mean_rooms"
)

# all_sub was reduced to a handful of columns in Step 1, so several of these
# variables are no longer on it. Filtering on a missing column selects nothing
# and exporting it fails, so re-attach whatever is missing from the full table
# (one row per LSOA, to avoid duplicating areas in the join).
tight_missing <- setdiff(tight_vars, names(all_sub))
tight_pool <- all_sub
if (length(tight_missing) > 0) {
  tight_lookup <- all[!duplicated(all$LSOA11), c("LSOA11", tight_missing)]
  tight_pool <- left_join(tight_pool, tight_lookup, by = "LSOA11")
}

# Areas above the 8th break point (70% quantile) for electricity, gas and driving.
# NOTE(review): the break points were computed from the 2017 energy columns but
# are compared with the 2011 columns here - confirm this is intended.
is_tight <- tight_pool$MeanDomElec_11_kWh > elec_decice[8] &
  tight_pool$MeanDomGas_11_kWh > gas_decice[8] &
  tight_pool$miles_percap > drive_decice[8]
tight <- tight_pool[which(is_tight), ] # which() drops NA comparisons
tight <- tight[!is.na(tight$LSOA11), ]

tm_shape(tight) +
  tm_polygons(
    col = "area_supergroup",
    popup.vars = c(tight_vars, "area_supergroup", "area_group")
  )

write.csv(
  st_drop_geometry(tight[, c("LSOA11", "area_group", "area_supergroup", tight_vars)]),
  "data/interview_areas_top_30p.csv",
  row.names = FALSE
)

```

Finally, we can go really tight. 33 LSOAs are in the top 10% for all three categories

```{r, echo=FALSE}
# Variables shown in the map popups for each area
tight2_vars <- c(
  "MeanDomElec_11_kWh",
  "MeanDomGas_11_kWh",
  "cars_percap",
  "miles_percap",
  "T2W_Car.",
  "pHeating_Gas",
  "pHeating_Electric",
  "pHeating_Other",
  "median_household_income",
  "mean_household.size",
  "mean_rooms"
)

# all_sub was reduced to a handful of columns in Step 1, so several of these
# variables are no longer on it. Filtering on a missing column selects nothing,
# so re-attach whatever is missing from the full table (one row per LSOA, to
# avoid duplicating areas in the join).
tight2_missing <- setdiff(tight2_vars, names(all_sub))
tight2_pool <- all_sub
if (length(tight2_missing) > 0) {
  tight2_lookup <- all[!duplicated(all$LSOA11), c("LSOA11", tight2_missing)]
  tight2_pool <- left_join(tight2_pool, tight2_lookup, by = "LSOA11")
}

# Areas above the 9th break point for electricity, gas and driving.
# NOTE(review): with break points at 0%, 10%, ..., 100% the 9th is the 80%
# quantile (top 20%), while the accompanying text says top 10% (the 10th break
# point) - confirm which is intended.
is_tight2 <- tight2_pool$MeanDomElec_11_kWh > elec_decice[9] &
  tight2_pool$MeanDomGas_11_kWh > gas_decice[9] &
  tight2_pool$miles_percap > drive_decice[9]
# which() drops NA comparisons, so no all-NA rows reach the map or the export
tight2 <- tight2_pool[which(is_tight2), ]

tm_shape(tight2) +
  tm_polygons(
    col = "area_supergroup",
    popup.vars = c(tight2_vars, "area_supergroup", "area_group")
  )

# Export Data
# LSOA names are taken from columns 5 and 6 of the raw gas file; the first and
# last rows of that file are skipped.
nms <- read.csv(
  "data-input/Energy Consumption/LSOA_domestic_gas_2017.csv.csv",
  stringsAsFactors = FALSE
)
nms <- nms[, 5:6]
names(nms) <- c("LSOA_Name", "LSOA11")
nms <- nms[2:(nrow(nms) - 1), ]

tight2 <- left_join(tight2, nms, by = c("LSOA11"))
tight2 <- st_drop_geometry(tight2)
# NOTE(review): unlike the other exports this one writes row names - confirm
# whether consumers of the file rely on that first column before changing it.
write.csv(tight2, "data/interview_areas.csv")
```