bennettoxford · milanwiedemann · Feb 16, 2026 · Feb 10, 2026 · Feb 11, 2026 · Feb 14, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: opencodecounts
 Title: Clinical Code Usage in England
-Version: 0.2.0
+Version: 0.5.0
 Authors@R: c(
     person("Milan", "Wiedemann", email = "milan.wiedemann@gmail.com", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-1991-282X")),

diff --git a/R/data.R b/R/data.R
@@ -80,6 +80,36 @@ NULL
 #'   dplyr::filter(icd10_code %in% codelist$code)
 "icd10_usage"
 
+#' Yearly ICD-10 Code Usage Breakdowns from Hospital Admitted Patient Care Activity in England
+#'
+#' Yearly summary of 4-character ICD-10 code usage with demographic breakdowns
+#' from 1st April 2012 to 31st March 2025.
+#' Includes breakdowns by diagnosis type (all/main), sex, and age group.
+#' Restricted codes for which annual usage is not published have been removed.
+#' @format A data frame with 6 columns:
+#' \describe{
+#'   \item{start_date}{Start date of code usage count}
+#'   \item{end_date}{End date of code usage count}
+#'   \item{icd10_code}{The 4-character ICD-10 Code.
+#'   Note that the punctuation from the code has been removed for compatibility with OpenCodelists.}
+#'   \item{description}{Description of the ICD-10 Code}
+#'   \item{breakdown}{Type of breakdown: all_diagnoses, main_diagnosis, male, female,
+#'   gender_unknown, or age groups (age_0, age_1_4, age_5_9, ..., age_85_89, age_90plus)}
+#'   \item{usage}{Annual count of code usage. NA where suppressed due to small numbers.}
+#' }
+#' @source <https://digital.nhs.uk/data-and-information/publications/statistical/hospital-admitted-patient-care-activity>
+#' @examples
+#' # Compare male vs female usage for codes whose description contains "pregnancy"
+#' icd10_usage_breakdowns |>
+#'   dplyr::filter(grepl("pregnancy", description, ignore.case = TRUE)) |>
+#'   dplyr::filter(breakdown %in% c("male", "female"))
+#'
+#' # Get age distribution for a specific code in the most recent year
+#' icd10_usage_breakdowns |>
+#'   dplyr::filter(icd10_code == "I251" & start_date == "2024-04-01") |>
+#'   dplyr::filter(grepl("^age_", breakdown))
+"icd10_usage_breakdowns"
+
 #' Yearly OPCS-4 Code Usage from Hospital Admitted Patient Care Activity in England
 #'
 #' Yearly summary of 4-character OPCS-4 code usage from 1st April 2013 to 31st March 2025.
@@ -100,3 +130,33 @@ NULL
 #' opcs4_usage |>
 #'   dplyr::filter(grepl("biopsy", description, ignore.case = TRUE) & lubridate::year(end_date) > 2020)
 "opcs4_usage"
+
+#' Yearly OPCS-4 Code Usage Breakdowns from Hospital Admitted Patient Care Activity in England
+#'
+#' Yearly summary of 4-character OPCS-4 code usage with demographic breakdowns
+#' from 1st April 2012 to 31st March 2025.
+#' Includes breakdowns by procedure type (all/main), sex, and age group.
+#' Restricted codes for which annual usage is not published have been removed.
+#' @format A data frame with 6 columns:
+#' \describe{
+#'   \item{start_date}{Start date of code usage count}
+#'   \item{end_date}{End date of code usage count}
+#'   \item{opcs4_code}{The 4-character OPCS-4 code.
+#'   Note that the punctuation from the code has been removed for compatibility with OpenCodelists.}
+#'   \item{description}{Description of the OPCS-4 Code}
+#'   \item{breakdown}{Type of breakdown: all_procedures, main_procedure, male, female,
+#'   gender_unknown, or age groups (age_0, age_1_4, age_5_9, ..., age_85_89, age_90plus)}
+#'   \item{usage}{Annual count of code usage. NA where suppressed due to small numbers.}
+#' }
+#' @source <https://digital.nhs.uk/data-and-information/publications/statistical/hospital-admitted-patient-care-activity>
+#' @examples
+#' # Get sex breakdown for hip replacement procedures
+#' opcs4_usage_breakdowns |>
+#'   dplyr::filter(grepl("hip replacement", description, ignore.case = TRUE)) |>
+#'   dplyr::filter(breakdown %in% c("male", "female"))
+#'
+#' # Get age distribution for a specific procedure code
+#' opcs4_usage_breakdowns |>
+#'   dplyr::filter(opcs4_code == "W371" & start_date == "2024-04-01") |>
+#'   dplyr::filter(grepl("^age_", breakdown))
+"opcs4_usage_breakdowns"
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -3,11 +3,15 @@ template:
   bootstrap: 5
 
 reference:
-- title: Datasets
+- title: Datasets with total counts
   contents:
   - snomed_usage
   - icd10_usage
   - opcs4_usage
+- title: Datasets with breakdowns
+  contents:
+  - icd10_usage_breakdowns
+  - opcs4_usage_breakdowns
 - title: Codelists
   contents:
   - get_codelist

diff --git a/data-raw/icd10_usage_breakdowns.R b/data-raw/icd10_usage_breakdowns.R
@@ -0,0 +1,229 @@
+# This script loads ICD-10 code usage data with demographic breakdowns
+# from files.digital.nhs.uk
+library(tidyverse)
+library(janitor)
+library(here)
+library(httr)
+
+# Using xlsx files because csv structure varies across years, xlsx stays consistent
+# All data from sheet "All Diagnoses 4 Character"
+
+url_start <- "https://files.digital.nhs.uk/"
+
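+# One entry per NHS financial year: workbook url, sheet index and cell range.
+# Entry names (e.g. "fy24to25") are parsed into start_date/end_date further down.
+# Columns are later selected by name - will break if column names change spelling/order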
+icd10_breakdowns_xlsx_urls <- list(
+  "fy24to25" = list(
+    url = paste0(
+      url_start,
+      "CC/EA025D/hosp-epis-stat-admi-diag-2024-25-tab.xlsx"
+    ),
+    sheet = 6,
+    range = "A11:AK11359"
+  ),
+  "fy23to24" = list(
+    url = paste0(
+      url_start,
+      "A5/5B8474/hosp-epis-stat-admi-diag-2023-24-tab.xlsx"
+    ),
+    sheet = 6,
+    range = "A11:AK11359"
+  ),
+  "fy22to23" = list(
+    url = paste0(
+      url_start,
+      "7A/DB1B00/hosp-epis-stat-admi-diag-2022-23-tab_V2.xlsx"
+    ),
+    sheet = 6,
+    range = "A11:AK11337"
+  ),
+  "fy21to22" = list(
+    url = paste0(
+      url_start,
+      "0E/E70963/hosp-epis-stat-admi-diag-2021-22-tab.xlsx"
+    ),
+    sheet = 6,
+    range = "A11:AK11341"
+  ),
+  "fy20to21" = list(
+    url = paste0(
+      url_start,
+      "5B/AD892C/hosp-epis-stat-admi-diag-2020-21-tab.xlsx"
+    ),
+    sheet = 6,
+    range = "A11:AK11218"
+  ),
+  "fy19to20" = list(
+    url = paste0(
+      url_start,
+      "37/8D9781/hosp-epis-stat-admi-diag-2019-20-tab%20supp.xlsx"
+    ),
+    sheet = 6,
+    range = "A11:AK11390"
+  ),
+  "fy18to19" = list(
+    url = paste0(
+      url_start,
+      "1C/B2AD9B/hosp-epis-stat-admi-diag-2018-19-tab.xlsx"
+    ),
+    sheet = 6,
+    range = "A11:AK11392"
+  ),
+  "fy17to18" = list(
+    url = paste0(
+      url_start,
+      "B2/5CEC8D/hosp-epis-stat-admi-diag-2017-18-tab.xlsx"
+    ),
+    sheet = 6,
+    range = "A11:AK11386"
+  ),
+  "fy16to17" = list(
+    url = paste0(
+      url_start,
+      "publication/7/d/hosp-epis-stat-admi-diag-2016-17-tab.xlsx"
+    ),
+    sheet = 6,
+    range = "A11:AK11418"
+  ),
+  "fy15to16" = list(
+    url = paste0(
+      url_start,
+      "publicationimport/pub22xxx/pub22378/hosp-epis-stat-admi-diag-2015-16-tab.xlsx"
+    ),
+    sheet = 6,
+    range = "A11:AK11353"
+  ),
+  "fy14to15" = list(
+    url = paste0(
+      url_start,
+      "publicationimport/pub19xxx/pub19124/hosp-epis-stat-admi-diag-2014-15-tab.xlsx"
+    ),
+    sheet = 6,
+    range = "A11:AK11345"
+  ),
+  "fy13to14" = list(
+    url = paste0(
+      url_start,
+      "publicationimport/pub16xxx/pub16719/hosp-epis-stat-admi-diag-2013-14-tab.xlsx"
+    ),
+    sheet = 6,
+    range = "A17:AF11357"
+  ),
+  "fy12to13" = list(
+    url = paste0(
+      url_start,
+      "publicationimport/pub12xxx/pub12566/hosp-epis-stat-admi-diag-2012-13-tab.xlsx"
+    ),
+    sheet = 6,
+    range = "A18:AF11400"
+  )
+)
+
+# Download one workbook and read the requested sheet/range.
+#
+# url_list: list with elements `url` (workbook address), `sheet` and `range`;
+#   the latter two are passed to readxl::read_xlsx().
+# ...: further arguments forwarded to readxl::read_xlsx().
+# Returns the data read by readxl::read_xlsx(), with column names cleaned by
+# janitor::make_clean_names().
+# Stops with an HTTP error if the download fails, instead of handing an error
+# page to read_xlsx().
+read_icd10_usage_xlsx_from_url <- function(url_list, ...) {
+  temp_file <- tempfile(fileext = ".xlsx")
+  # Remove the downloaded workbook once it has been read (or on error)
+  on.exit(unlink(temp_file), add = TRUE)
+
+  response <- GET(
+    url_list$url,
+    write_disk(temp_file, overwrite = TRUE)
+  )
+  # Fail loudly on 4xx/5xx responses
+  stop_for_status(response)
+
+  readxl::read_xlsx(
+    temp_file,
+    col_names = TRUE,
+    .name_repair = janitor::make_clean_names,
+    sheet = url_list$sheet,
+    range = url_list$range,
+    ...
+  )
+}
+
+# Fetch every financial year's workbook; the result is a list named by year
+icd10_usage_raw_list <- map(
+  icd10_breakdowns_xlsx_urls,
+  read_icd10_usage_xlsx_from_url
+)
+
+# Inspect each year's column names as read (already cleaned by make_clean_names)
+map(icd10_usage_raw_list, names)
+
+# Keep the code, its description and every breakdown count column under
+# standard names: columns 1 and 2 are taken by position, the rest by name.
+# Rows that are entirely empty are dropped and age_90 is renamed to age_90plus.
+select_all_diag_breakdowns <- function(data) {
+  selected <- dplyr::select(
+    data,
+    icd10_code = 1,
+    description = 2,
+    c("all_diagnoses", "main_diagnosis", "male", "female", "gender_unknown"),
+    starts_with("age")
+  )
+  non_empty <- remove_empty(selected, which = "rows")
+  rename(non_empty, age_90plus = age_90)
+}
+
+# Coerce the code and description to character and every other column to
+# numeric. Values that cannot be parsed as numbers (such as the "-" used for
+# suppressed counts) become NA; the resulting coercion warnings are silenced.
+set_col_types <- function(data) {
+  # mutate() has to run inside suppressWarnings() for the warnings to be caught
+  suppressWarnings(
+    mutate(
+      data,
+      across(c(icd10_code, description), as.character),
+      across(!c(icd10_code, description), as.numeric)
+    )
+  )
+}
+
+# Sanity check: after selection every year should share the same column
+# names, so this should return a list holding a single character vector
+icd10_usage_raw_list |>
+  map(\(year_data) names(select_all_diag_breakdowns(year_data))) |>
+  unique()
+
+# Combine all years into one data frame and derive the financial year dates.
+# The list names ("fy24to25", ...) become `nhs_fy`, which is split on "to" into
+# the two-digit start and end years; these are expanded to 1st April of the
+# start year and 31st March of the end year (assumes years 2000-2099).
+# Finally, non-alphanumeric characters are stripped from the codes.
+icd10_usage_breakdowns_long <- icd10_usage_raw_list |>
+  map(select_all_diag_breakdowns) |>
+  map(set_col_types) |>
+  bind_rows(.id = "nhs_fy") |>
+  separate(nhs_fy, into = c("start_date", "end_date"), sep = "to") |>
+  mutate(
+    # str_extract() returns a character vector (first match per row);
+    # str_extract_all() returns a list, which paste0() only copes with through
+    # implicit coercion when there is exactly one match per element
+    start_date = as.Date(
+      paste0("20", str_extract(start_date, "\\d+"), "-04-01")
+    ),
+    end_date = as.Date(
+      paste0("20", str_extract(end_date, "\\d+"), "-03-31")
+    ),
+    icd10_code = gsub("\\s?[^[:alnum:]]+\\s?", "", icd10_code)
+  )
+
+# Reshape to long format: the breakdown columns are stacked into a `breakdown`
+# name column and a `usage` count column, with the count stored as an integer
+icd10_usage_breakdowns <- pivot_longer(
+  icd10_usage_breakdowns_long,
+  cols = all_diagnoses:age_90plus,
+  names_to = "breakdown",
+  values_to = "usage"
+)
+icd10_usage_breakdowns <- dplyr::mutate(
+  icd10_usage_breakdowns,
+  usage = as.integer(usage)
+)
+
+# List the distinct code/description/usage combinations that have no
+# description, for manual inspection
+icd10_usage_breakdowns |>
+  dplyr::filter(is.na(description)) |>
+  dplyr::distinct(icd10_code, description, usage)
+
+# Drop the rows whose description is missing
+icd10_usage_breakdowns <- dplyr::filter(
+  icd10_usage_breakdowns,
+  !is.na(description)
+)
+
+# Apply the package's internal fix_encoding() helper to the descriptions
+icd10_usage_breakdowns <- mutate(
+  icd10_usage_breakdowns,
+  description = opencodecounts:::fix_encoding(description)
+)
+
+# Re-run the package's encoding check on the fixed data; expected output:
+# character(0)
+opencodecounts:::get_codes_with_encoding_problems(
+  icd10_usage_breakdowns,
+  icd10_code
+)
+
+# Store the dataset in the package with bzip2 compression, replacing any
+# previously saved version
+usethis::use_data(icd10_usage_breakdowns, overwrite = TRUE, compress = "bzip2")