R version 4.4.1 (2024-06-14 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 10 x64 (build 19044)
Matrix products: default
locale:
[1] LC_COLLATE=Italian_Italy.utf8 LC_CTYPE=Italian_Italy.utf8 LC_MONETARY=Italian_Italy.utf8
[4] LC_NUMERIC=C LC_TIME=Italian_Italy.utf8
time zone: Europe/Rome
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] reticulate_1.45.0 SqlRender_1.19.4 PatientLevelPrediction_6.5.1 FeatureExtraction_3.12.0
[5] Andromeda_1.2.0 dplyr_1.2.0 CohortGenerator_1.1.0 R6_2.6.1
[9] DatabaseConnector_7.1.0 stringi_1.8.7 here_1.0.2
loaded via a namespace (and not attached):
[1] generics_0.1.4 RSQLite_2.4.6 lattice_0.22-6 magrittr_2.0.4 grid_4.4.1
[6] fastmap_1.2.0 blob_1.3.0 rprojroot_2.1.1 jsonlite_2.0.0 Matrix_1.7-0
[11] backports_1.5.0 DBI_1.3.0 survival_3.8-6 urltools_1.7.3.1 purrr_1.2.1
[16] duckdb_1.4.4 cli_3.6.5 rlang_1.1.7 dbplyr_2.5.2 triebeard_0.4.1
[21] splines_4.4.1 bit64_4.6.0-1 withr_3.0.2 cachem_1.1.0 otel_0.2.0
[26] tools_4.4.1 memuse_4.2-3 ParallelLogger_3.5.1 memoise_2.0.1 checkmate_2.3.4
[31] vctrs_0.7.1 png_0.1-8 lifecycle_1.0.5 bit_4.6.0 pkgconfig_2.0.3
[36] rJava_1.0-14 pillar_1.11.1 glue_1.8.0 Rcpp_1.1.1 tibble_3.3.1
[41] tidyselect_1.2.1
library(FeatureExtraction)
library(PatientLevelPrediction)
library(SqlRender)
# Define output folder ----
outputFolder <- here::here("path/to/folder")
# Create the output folder if it doesn't exist. dir.exists() is used rather
# than file.exists() so that a regular file sitting at the same path is not
# mistaken for an existing folder (dir.create() then reports the conflict).
if (!dir.exists(outputFolder)) {
  dir.create(outputFolder, recursive = TRUE)
}
# Custom covariate builder that calculates the changes in laboratory test
# results for each hospital admission.
#
# The function is referenced by name through the "fun" attribute set in
# createMeasurementDeltaSettings().
#
# Arguments:
#   connection              Connection handed to the DatabaseConnector
#                           render/translate helpers.
#   tempEmulationSchema     Accepted but not used by this function.
#   cdmDatabaseSchema       Rendered into the SQL as both
#                           `vocabulary_database_schema` and `cdm_schema`.
#   cohortTable             Rendered into the setup SQL as `cohort_table`.
#   cohortIds               Accepted but not used; the setup SQL is rendered
#                           with a fixed `cohort_id_ricoveri = 2`.
#   cdmVersion              Accepted but not used by this function.
#   rowIdField              Accepted but not used by this function.
#   covariateSettings       Accepted but not used by this function.
#   aggregated              Must be FALSE; TRUE raises an error.
#   minCharacterizationMean Accepted but not used by this function.
#
# Returns:
#   An Andromeda object with the tables `covariates` (rowId, covariateId,
#   covariateValue), `covariateRef` and `analysisRef`, a "metaData" attribute
#   holding `populationSize` (number of distinct rowId values), and class
#   "CovariateData".
#
# NOTE(review): `rowIdField` and `cohortIds` are ignored, so the `rowId` values
# come entirely from custom-covariates-SELECT.sql. If the caller expects row
# ids taken from a different field than the one that SQL selects, the
# covariates may not line up with the cohort rows -- confirm against the SQL
# files and the calling package.
getMeasurementDeltaCovariateData <- function(connection,
                                             tempEmulationSchema = NULL,
                                             cdmDatabaseSchema,
                                             cohortTable = "#cohort_person",
                                             cohortIds = -1,
                                             cdmVersion = "5",
                                             rowIdField = "visit_occurrence_id",
                                             covariateSettings,
                                             aggregated = FALSE,
                                             minCharacterizationMean = 0.001) {
  writeLines("Building custom delta measurement covariates...")
  if (aggregated) {
    stop("Aggregated not supported")
  }

  # Functions are namespace-qualified so the builder does not depend on
  # `here` or DatabaseConnector having been attached with library().
  setupSql <- SqlRender::readSql(
    here::here("sql", "custom-covariates-SETUP.sql")
  )
  selectSql <- SqlRender::readSql(
    here::here("sql", "custom-covariates-SELECT.sql")
  )

  # NOTE(review): the cohort id is hard-coded to 2 instead of being taken from
  # `cohortIds` -- presumably the admissions cohort; verify this is intended.
  DatabaseConnector::renderTranslateExecuteSql(
    connection,
    sql = setupSql,
    vocabulary_database_schema = cdmDatabaseSchema,
    cohort_table = cohortTable,
    cohort_id_ricoveri = 2,
    cdm_schema = cdmDatabaseSchema
  )

  covariates <- DatabaseConnector::renderTranslateQuerySql(
    connection,
    sql = selectSql,
    snakeCaseToCamelCase = TRUE,
    cdm_schema = cdmDatabaseSchema
  )

  # One reference row per distinct covariate / concept combination.
  covariateNames <- unique(
    covariates[, c("covariateId", "conceptName", "measurementConceptId")]
  )

  # Keep only the sparse covariate columns (drops conceptName and
  # measurementConceptId).
  covariates <- covariates[, c("rowId", "covariateId", "covariateValue")]

  # Single analysis id shared by covariateRef and analysisRef.
  deltaAnalysisId <- 10000

  # Create covariateRef
  covariateRef <- data.frame(
    covariateId = covariateNames$covariateId,
    covariateName = paste0(
      "Delta within 48h ±12h from baseline: ",
      covariateNames$conceptName
    ),
    analysisId = deltaAnalysisId,
    conceptId = covariateNames$measurementConceptId
  )

  # Create analysisRef
  analysisRef <- data.frame(
    analysisId = deltaAnalysisId,
    analysisName = "Measurement Delta 48h ±12h",
    domainId = "Measurement",
    startDay = -3,
    endDay = 0,
    isBinary = "N",
    missingMeansZero = "N"
  )

  # Create Andromeda object
  result <- Andromeda::andromeda(
    covariates = covariates,
    covariateRef = covariateRef,
    analysisRef = analysisRef
  )
  attr(result, "metaData") <- list(
    populationSize = length(unique(covariates$rowId))
  )
  class(result) <- "CovariateData"
  result
}
# Build the settings object for the measurement-delta covariates.
#
# Returns a one-element list (`useMeasurementDelta = TRUE`) of class
# "covariateSettings" whose "fun" attribute holds the name of the builder
# function, "getMeasurementDeltaCovariateData".
createMeasurementDeltaSettings <- function() {
  structure(
    list(useMeasurementDelta = TRUE),
    fun = "getMeasurementDeltaCovariateData",
    class = "covariateSettings"
  )
}
# ------------------------------------------------------------------------------
# Data and Feature Extraction
# ------------------------------------------------------------------------------

# Settings for the custom delta lab variables
MeasurementDelta <- createMeasurementDeltaSettings()

# Only the measurement-delta covariates are requested in this test
covariateSettingsList <- list(MeasurementDelta)

# Stand-alone extraction for cohort 2.
# NOTE(review): rowIdField is "person_id" here, while the custom builder's own
# default is "visit_occurrence_id" and the builder does not read the argument
# at all -- confirm which row id the SQL actually returns.
covariateData <- getDbCovariateData(
  connectionDetails    = connectionDetails,
  cdmDatabaseSchema    = cdmDatabaseSchema,
  cohortDatabaseSchema = cohortDatabaseSchema,
  cohortTable          = cohortTable,
  cohortIds            = 2,
  rowIdField           = "person_id",
  covariateSettings    = covariateSettingsList
)
summary(covariateData)

# Covariate values joined with their reference rows, for inspection
covariates_overview <- merge(
  covariateData$covariates,
  covariateData$covariateRef,
  by = "covariateId"
)

# Target cohort 2 and outcome cohort 1 are read from the same cohort table.
# NOTE(review): this uses resultsDatabaseSchema for the cohort schema, while
# the getDbCovariateData() call above uses cohortDatabaseSchema -- verify they
# point to the same schema.
databaseDetails <- createDatabaseDetails(
  connectionDetails     = connectionDetails,
  cdmDatabaseSchema     = cdmDatabaseSchema,
  cdmDatabaseName       = cdmDatabaseName,
  cohortDatabaseSchema  = resultsDatabaseSchema,
  cohortTable           = cohortTable,
  targetId              = 2,
  outcomeDatabaseSchema = resultsDatabaseSchema,
  outcomeTable          = cohortTable,
  outcomeIds            = 1,
  cdmVersion            = 5
)

# Restriction settings are left at the package defaults
restrictPlpDataSettings <- createRestrictPlpDataSettings()

# Extract the PLP data set with the custom covariates
plpData <- getPlpData(
  databaseDetails         = databaseDetails,
  covariateSettings       = covariateSettingsList,
  restrictPlpDataSettings = restrictPlpDataSettings
)
# ------------------------------------------------------------------------------
# Model training (person split or time split)
# ------------------------------------------------------------------------------
# Logical arguments are spelled TRUE/FALSE: T and F are ordinary variables that
# can be reassigned, whereas TRUE and FALSE are reserved words.

# Study population: subjects with a prior outcome are kept
populationSettings <- createStudyPopulationSettings(
  removeSubjectsWithPriorOutcome = FALSE
)

# 75% train / 25% test, stratified, 5 folds, fixed seed
splitSettings <- createDefaultSplitSetting(
  trainFraction = 0.75,
  testFraction = 0.25,
  type = "stratified",
  nfold = 5,
  splitSeed = 34568
)

# Sampling settings are left at the package defaults
sampleSettings <- createSampleSettings()

# Median imputer with missingThreshold = 0.8
featureEngineeringSettings <- createSimpleImputer(
  method = "median",
  missingThreshold = 0.8
)

# minFraction = 0, normalization and redundancy removal switched off
preprocessSettings <- createPreprocessSettings(
  minFraction = 0,
  normalize = FALSE,
  removeRedundancy = FALSE
)

# Set ML models
lrModel <- setLassoLogisticRegression()

# Train a single model for outcome 1; results are saved under
# <working directory>/singlePlp
lrResults <- runPlp(
  plpData = plpData,
  outcomeId = 1,
  analysisId = "singleDemo",
  analysisName = "Demonstration of runPlp for training single PLP models",
  populationSettings = populationSettings,
  splitSettings = splitSettings,
  sampleSettings = sampleSettings,
  featureEngineeringSettings = featureEngineeringSettings,
  preprocessSettings = preprocessSettings,
  modelSettings = lrModel,
  logSettings = createLogSettings(),
  executeSettings = createExecuteSettings(
    runSplitData = TRUE,
    runSampleData = TRUE,
    runFeatureEngineering = TRUE,
    runPreprocessData = TRUE,
    runModelDevelopment = TRUE,
    runCovariateSummary = FALSE
  ),
  saveDirectory = file.path(getwd(), "singlePlp")
)
Describe the bug
We are conducting a study on hospital sepsis prediction at discharge.
Following the guidelines, we selected as covariates the change (delta) in several laboratory test results, computed between the baseline measurement and the 48/72-hour measurement.
Since these variables are not included among the package’s default covariates, we defined custom covariates.
However, when running the model, we encounter an error.
Our hypothesis is that the issue is related to the data structure: each record in our cohort represents a hospital visit rather than an individual patient, so the same patient can appear more than once. As a result, the model may not be able to correctly associate multiple values from the same patient with the appropriate visit.
Set up (please run in R "sessionInfo()" and copy the output here):
To Reproduce
PLP Log File
plpLog.txt
Additional context
Cohorts JSON files:
Custom covariate files:
plpData[["cohorts"]]
plpData[["outcome"]]