forked from NSAPH/GPSmatching
-
Notifications
You must be signed in to change notification settings - Fork 6
Open
Description
I'm using the code below
# Dependencies ----
library(ggplot2)
library(CausalGPS)
library(memoise)
library(data.table)

# Read the input dataset from disk.
input_csv <- "xxx.csv"
data <- read.csv(input_csv)

# Fix the RNG state so the matching results are reproducible.
rng_seed <- 249
set.seed(rng_seed)
# Choose which example configuration to run.
example <- "matching_1"

#------To reproduce the same results, do not change below this line.------------
# Trimming quantiles (p_q1, p_q2) and caliper (delta_n) for each example,
# resolved via a single switch() lookup; unknown names hit the default stop().
config <- switch(example,
  matching_1 = list(p_q1 = 0.01, p_q2 = 0.99, delta_n = 1.7),
  matching_2 = list(p_q1 = 0.05, p_q2 = 0.95, delta_n = 0.9),
  matching_3 = list(p_q1 = 0.10, p_q2 = 0.90, delta_n = 1.9),
  stop("Example type is not defined.")
)
p_q1 <- config$p_q1
p_q2 <- config$p_q2
delta_n <- config$delta_n
# Run configuration ----
sc_name <- example
ci_appr <- "matching"     # causal inference approach passed to CausalGPS
gps_density <- "normal"
max_it <- 10              # max attempts at achieving covariate balance
scale_val <- 1

# The raw range of the exposure variable (fields_num) is [1, 300].
exposure_min <- min(data$fields_num)
exposure_max <- max(data$fields_num)

# Number of caliper bins over the exposure range (Note: it's 79 here).
L <- ceiling((exposure_max - exposure_min) / (2 * delta_n))

# Confounders to adjust for in the GPS model.
confounders <- c("records_num", "views_num", "tables_num")

# Attach a stable row id, used later to join the pseudo-population back to
# the outcome column. seq_len() is safe for zero-row data, unlike 1:nrow().
data$id <- seq_len(nrow(data))
# Generate the matched pseudo-population ----
start_time <- proc.time()

# Hyper-parameter search space for the XGBoost GPS model.
xgb_grid <- list(
  xgb_max_depth = c(3, 4, 5),
  xgb_nrounds = seq(10, 40, 1)
)

ps_pop_obj_1 <- generate_pseudo_pop(
  data[, c("id", "fields_num")],       # exposure column, keyed by id
  data[, c("id", confounders)],        # confounder columns, keyed by id
  ci_appr = ci_appr,
  gps_density = gps_density,
  bin_seq = NULL,
  exposure_trim_qtls = c(p_q1, p_q2),  # trim exposure tails before matching
  use_cov_transform = TRUE,
  params = xgb_grid,
  sl_lib = c("m_xgboost"),
  nthread = 10,
  covar_bl_method = "absolute",
  covar_bl_trs = 0.1,
  covar_bl_trs_type = "maximal",
  max_attempt = max_it,
  dist_measure = "l1",
  delta_n = delta_n,
  scale = scale_val
)

end_time <- proc.time()
end_time - start_time  # elapsed wall/CPU time for the matching step

# Covariate balance diagnostics for the generated pseudo-population.
plot(ps_pop_obj_1, include_details = TRUE)
# Get the matched set? (I'm not sure if this is correct, but the result is weird.)
# The max value of fields_num (the exposure variable of my dataset) is 43 and the
# min value is 6, which is very different from the original range [1, 300];
# I don't know the reason.
# Join the outcome column ("ttu") onto the matched pseudo-population by id.
# NOTE(review): this rebinds `data` to the merged result, replacing the
# original dataset in the workspace.
data <- merge(data[, c("id", "ttu")], ps_pop_obj_1$pseudo_pop, by = "id")

# Estimate the exposure-response function (ERF) ----
start_time <- proc.time()

m_Y <- as.numeric(data$ttu)                        # outcome vector
m_w <- as.numeric(data$fields_num)                 # exposure vector
counter_weight <- as.numeric(data$counter_weight)  # matching counter weights

erf_obj <- estimate_npmetric_erf(
  m_Y,
  m_w,
  counter_weight,
  bw_seq = seq(0.2, 2, 0.2),  # candidate kernel bandwidths
  w_vals = seq(10, 40, 1),    # exposure grid at which the ERF is evaluated
  nthread = 1
)

end_time <- proc.time()
end_time - start_time  # elapsed time for the ERF estimation step
plot(erf_obj)

Question 1: How do I get the matched dataset? I think it should be an L x N dataset, but the shape of the dataset I get from the code below is (41092, 14):
data <- merge(data[, c("id", "ttu")], ps_pop_obj_1$pseudo_pop, by = "id")

Question 2: How do I get the ERF graph? When I ran the code below, I got the following error:
Error in checkForRemoteErrors(val) :
one node produced an error: NA/NaN/Inf in foreign function call (arg 4)
start_time <- proc.time()
m_Y <- as.numeric(data$ttu)
m_w <- as.numeric(data$fields_num)
counter_weight <- as.numeric(data$counter_weight)
erf_obj <- estimate_npmetric_erf(m_Y,
m_w,
counter_weight,
bw_seq = seq(0.2,2,0.2),
w_vals = seq(10, 40, 1),
nthread = 1)
end_time <- proc.time()
end_time - start_time
plot(erf_obj)

Question 3: Why did the range of the exposure variable (in this case, "fields_num") change so much?
I think that if the original range is [1, 300] and I get 79 bins using the caliper, what I should get is something like this:

- median exposure value of the first bin + outcome value
- median exposure value of the second bin + outcome value
- ...
- median exposure value of the last bin + outcome value

So I would get 79 pairs of exposure and outcome values, and could then use a kernel smoother to obtain the ERF graph.
I have read the papers below carefully, but I still can't find the answers to the questions above; I'm sorry if I missed them.
https://arxiv.org/pdf/1812.06575
https://arxiv.org/pdf/2310.00561
[Reference]
The result of summary(ps_pop_obj_1)
Number of data samples: 41092
Number of iterations: 1
Effective sample size:
Achieved: 3583.31032217326
Min recommended: 4109.2
As you can see above, I have 50,000 samples, so I think "Number of data samples" means that some of the samples were trimmed.
I don't know what the "Effective sample size" is.
But the covariate balance test is passed.
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels