Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: opencodecounts
Title: Clinical Code Usage in England
Version: 0.2.0
Version: 0.5.0
Authors@R: c(
person("Milan", "Wiedemann", email = "milan.wiedemann@gmail.com", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-1991-282X")),
Expand Down
60 changes: 60 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,36 @@ NULL
#' dplyr::filter(icd10_code %in% codelist$code)
"icd10_usage"

#' Yearly ICD-10 Code Usage Breakdowns from Hospital Admitted Patient Care Activity in England
#'
#' Yearly summary of 4-character ICD-10 code usage with demographic breakdowns
#' from 1st April 2012 to 31st March 2025.
#' Includes breakdowns by diagnosis type (all/main), sex, and age group.
#' Restricted codes for which annual usage is not published have been removed.
#' @format A data frame with 6 columns:
#' \describe{
#' \item{start_date}{Start date of code usage count}
#' \item{end_date}{End date of code usage count}
#' \item{icd10_code}{The 4-character ICD-10 Code.
#' Note that the punctuation from the code has been removed for compatibility with OpenCodelists.}
#' \item{description}{Description of the ICD-10 Code}
#' \item{breakdown}{Type of breakdown: all_diagnoses, main_diagnosis, male, female,
#' gender_unknown, or age groups (age_0, age_1_4, age_5_9, ..., age_85_89, age_90plus)}
#' \item{usage}{Annual count of code usage. NA where suppressed due to small numbers.}
#' }
#' @source <https://digital.nhs.uk/data-and-information/publications/statistical/hospital-admitted-patient-care-activity>
#' @examples
#' # Compare male vs female usage for codes whose description contains "pregnancy"
#' icd10_usage_breakdowns |>
#' dplyr::filter(grepl("pregnancy", description, ignore.case = TRUE)) |>
#' dplyr::filter(breakdown %in% c("male", "female"))
#'
#' # Get age distribution for a specific code in the most recent year
#' icd10_usage_breakdowns |>
#' dplyr::filter(icd10_code == "I251" & start_date == "2024-04-01") |>
#' dplyr::filter(grepl("^age_", breakdown))
"icd10_usage_breakdowns"

#' Yearly OPCS-4 Code Usage from Hospital Admitted Patient Care Activity in England
#'
#' Yearly summary of 4-character OPCS-4 code usage from 1st April 2013 to 31st March 2025.
Expand All @@ -100,3 +130,33 @@ NULL
#' opcs4_usage |>
#' dplyr::filter(grepl("biopsy", description, ignore.case = TRUE) & lubridate::year(end_date) > 2020)
"opcs4_usage"

#' Yearly OPCS-4 Code Usage Breakdowns from Hospital Admitted Patient Care Activity in England
#'
#' Yearly summary of 4-character OPCS-4 code usage with demographic breakdowns
#' from 1st April 2012 to 31st March 2025.
#' Includes breakdowns by procedure type (all/main), sex, and age group.
#' Restricted codes for which annual usage is not published have been removed.
#' @format A data frame with 6 columns:
#' \describe{
#' \item{start_date}{Start date of code usage count}
#' \item{end_date}{End date of code usage count}
#' \item{opcs4_code}{The 4-character OPCS-4 code.
#' Note that the punctuation from the code has been removed for compatibility with OpenCodelists.}
#' \item{description}{Description of the OPCS-4 Code}
#' \item{breakdown}{Type of breakdown: all_procedures, main_procedure, male, female,
#' gender_unknown, or age groups (age_0, age_1_4, age_5_9, ..., age_85_89, age_90plus)}
#' \item{usage}{Annual count of code usage. NA where suppressed due to small numbers.}
#' }
#' @source <https://digital.nhs.uk/data-and-information/publications/statistical/hospital-admitted-patient-care-activity>
#' @examples
#' # Get sex breakdown for hip replacement procedures
#' opcs4_usage_breakdowns |>
#' dplyr::filter(grepl("hip replacement", description, ignore.case = TRUE)) |>
#' dplyr::filter(breakdown %in% c("male", "female"))
#'
#' # Get age distribution for a specific procedure code
#' opcs4_usage_breakdowns |>
#' dplyr::filter(opcs4_code == "W371" & start_date == "2024-04-01") |>
#' dplyr::filter(grepl("^age_", breakdown))
"opcs4_usage_breakdowns"
6 changes: 5 additions & 1 deletion _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@ template:
bootstrap: 5

reference:
- title: Datasets
- title: Datasets with total counts
contents:
- snomed_usage
- icd10_usage
- opcs4_usage
- title: Datasets with breakdowns
contents:
- icd10_usage_breakdowns
- opcs4_usage_breakdowns
- title: Codelists
contents:
- get_codelist
Expand Down
229 changes: 229 additions & 0 deletions data-raw/icd10_usage_breakdowns.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
# This script loads ICD-10 code usage data with demographic breakdowns
# from files.digital.nhs.uk
library(tidyverse)
library(janitor)
library(here)
library(httr)

# Using xlsx files because csv structure varies across years, xlsx stays consistent
# All data from sheet "All Diagnoses 4 Character"

# Common prefix of every download URL
url_start <- "https://files.digital.nhs.uk/"

# One entry per NHS financial year, keyed "fyYYtoYY". Each entry holds the
# workbook URL, the sheet index and the cell range to read.
# Columns are later selected by name - the import will break if the column
# names change spelling/order.
icd10_breakdowns_xlsx_urls <- local({
  # Build a single entry from the path below `url_start` and the cell range
  source_entry <- function(path, range, sheet = 6) {
    list(url = paste0(url_start, path), sheet = sheet, range = range)
  }

  list(
    "fy24to25" = source_entry(
      "CC/EA025D/hosp-epis-stat-admi-diag-2024-25-tab.xlsx",
      "A11:AK11359"
    ),
    "fy23to24" = source_entry(
      "A5/5B8474/hosp-epis-stat-admi-diag-2023-24-tab.xlsx",
      "A11:AK11359"
    ),
    "fy22to23" = source_entry(
      "7A/DB1B00/hosp-epis-stat-admi-diag-2022-23-tab_V2.xlsx",
      "A11:AK11337"
    ),
    "fy21to22" = source_entry(
      "0E/E70963/hosp-epis-stat-admi-diag-2021-22-tab.xlsx",
      "A11:AK11341"
    ),
    "fy20to21" = source_entry(
      "5B/AD892C/hosp-epis-stat-admi-diag-2020-21-tab.xlsx",
      "A11:AK11218"
    ),
    "fy19to20" = source_entry(
      "37/8D9781/hosp-epis-stat-admi-diag-2019-20-tab%20supp.xlsx",
      "A11:AK11390"
    ),
    "fy18to19" = source_entry(
      "1C/B2AD9B/hosp-epis-stat-admi-diag-2018-19-tab.xlsx",
      "A11:AK11392"
    ),
    "fy17to18" = source_entry(
      "B2/5CEC8D/hosp-epis-stat-admi-diag-2017-18-tab.xlsx",
      "A11:AK11386"
    ),
    "fy16to17" = source_entry(
      "publication/7/d/hosp-epis-stat-admi-diag-2016-17-tab.xlsx",
      "A11:AK11418"
    ),
    "fy15to16" = source_entry(
      "publicationimport/pub22xxx/pub22378/hosp-epis-stat-admi-diag-2015-16-tab.xlsx",
      "A11:AK11353"
    ),
    "fy14to15" = source_entry(
      "publicationimport/pub19xxx/pub19124/hosp-epis-stat-admi-diag-2014-15-tab.xlsx",
      "A11:AK11345"
    ),
    "fy13to14" = source_entry(
      "publicationimport/pub16xxx/pub16719/hosp-epis-stat-admi-diag-2013-14-tab.xlsx",
      "A17:AF11357"
    ),
    "fy12to13" = source_entry(
      "publicationimport/pub12xxx/pub12566/hosp-epis-stat-admi-diag-2012-13-tab.xlsx",
      "A18:AF11400"
    )
  )
})

# Download one xlsx workbook and read the configured sheet and cell range.
#
# @param url_list A list with elements `url`, `sheet` and `range`
#   (one entry of `icd10_breakdowns_xlsx_urls`).
# @param ... Further arguments passed on to `readxl::read_xlsx()`.
# @return The table returned by `readxl::read_xlsx()`, with column names
#   cleaned by `janitor::make_clean_names()`.
# Stops with an HTTP error if the download request is not successful.
read_icd10_usage_xlsx_from_url <- function(url_list, ...) {
  temp_file <- tempfile(fileext = ".xlsx")
  # Remove the downloaded workbook once the function exits
  on.exit(unlink(temp_file), add = TRUE)

  response <- GET(
    url_list$url,
    write_disk(temp_file, overwrite = TRUE)
  )
  # Fail here with the HTTP status instead of handing an error page to readxl
  stop_for_status(response)

  readxl::read_xlsx(
    temp_file,
    col_names = TRUE,
    .name_repair = janitor::make_clean_names,
    sheet = url_list$sheet,
    range = url_list$range,
    ...
  )
}

# Fetch and parse every workbook listed in `icd10_breakdowns_xlsx_urls`
icd10_usage_raw_list <- map(
  icd10_breakdowns_xlsx_urls,
  read_icd10_usage_xlsx_from_url
)

# Inspect each year's column names before columns are selected and standardised
map(icd10_usage_raw_list, names)

# Keep the code, description and breakdown columns under standard names.
# The first two columns are taken by position and renamed; the breakdown
# columns are taken by name. Empty rows are dropped and `age_90` is renamed
# to `age_90plus`.
select_all_diag_breakdowns <- function(data) {
  selected <- dplyr::select(
    data,
    icd10_code = 1,
    description = 2,
    c("all_diagnoses", "main_diagnosis"),
    c("male", "female", "gender_unknown"),
    starts_with("age")
  )
  non_empty <- remove_empty(selected, "rows")
  rename(non_empty, age_90plus = age_90)
}

# Coerce column types: code and description to character, every other column
# to numeric. Values that cannot be parsed as numbers (e.g. the "-" used for
# suppressed counts) become NA.
set_col_types <- function(data) {
  # The numeric coercion is the only step expected to warn ("NAs introduced
  # by coercion"), so silence that call alone rather than the whole pipeline.
  as_numeric_quietly <- function(x) suppressWarnings(as.numeric(x))

  data |>
    mutate(
      across(c(icd10_code, description), as.character),
      across(!c(icd10_code, description), as_numeric_quietly)
    )
}

# Confirm the standardised column names agree across years:
# a single element in the result means every year matches
unique(
  map(
    icd10_usage_raw_list,
    \(year_data) names(select_all_diag_breakdowns(year_data))
  )
)

# Combine all years into one table and derive the financial-year dates.
# The list names ("fy24to25", ...) become `nhs_fy`, which is split on "to";
# the two-digit years are then expanded to 1 April / 31 March dates.
icd10_usage_breakdowns_long <- icd10_usage_raw_list |>
  map(select_all_diag_breakdowns) |>
  map(set_col_types) |>
  bind_rows(.id = "nhs_fy") |>
  separate(nhs_fy, c("start_date", "end_date"), "to") |>
  mutate(
    # str_extract() returns a character vector with one match per element;
    # str_extract_all() returns a list that paste0() would coerce implicitly
    start_date = as.Date(
      paste0("20", str_extract(start_date, "\\d+"), "-04-01")
    ),
    end_date = as.Date(
      paste0("20", str_extract(end_date, "\\d+"), "-03-31")
    ),
    # Strip punctuation (and whitespace next to it) from the codes
    icd10_code = gsub("\\s?[^[:alnum:]]+\\s?", "", icd10_code)
  )

# Reshape the breakdown columns into `breakdown` / `usage` pairs,
# then store the counts as integers
icd10_usage_breakdowns <- pivot_longer(
  icd10_usage_breakdowns_long,
  cols = all_diagnoses:age_90plus,
  names_to = "breakdown",
  values_to = "usage"
)
icd10_usage_breakdowns <- dplyr::mutate(
  icd10_usage_breakdowns,
  usage = as.integer(usage)
)

# Show the distinct code/usage combinations that have no description
icd10_usage_breakdowns |>
  filter(is.na(description)) |>
  distinct(icd10_code, description, usage)

# Drop rows whose description is missing
icd10_usage_breakdowns <- filter(
  icd10_usage_breakdowns,
  !is.na(description)
)

# Apply the package's internal encoding fix to the descriptions
icd10_usage_breakdowns <- mutate(
  icd10_usage_breakdowns,
  description = opencodecounts:::fix_encoding(description)
)

# List codes that still have encoding problems after the fix
icd10_usage_breakdowns |>
  opencodecounts:::get_codes_with_encoding_problems(icd10_code)
# character(0)

# Store the final table as package data (bzip2-compressed),
# replacing any existing copy
usethis::use_data(icd10_usage_breakdowns, overwrite = TRUE, compress = "bzip2")
Loading