#code for Chapter 3 "Propensity Score Weighting" of book:
#Leite, W. L. (2016). Practical propensity score analysis using R.
#Thousand Oaks, CA: Sage.
#
#this is the code that was used to generate the example results in the book
#As the R software and the R packages used in this example are updated frequently
#some incompatibilities between the current code and new R versions or package versions
#may appear
#Any updates to the code will be posted at:
# http://www.practicalpropensityscore.com
#Estimate the effect of high school student participation in
#career academies on future income
#using data from the Education Longitudinal Study (ELS)
#load dataset
load(file="Chapter3_ELS_data_imputed_example_career_academy.Rdata")
#------------------------------------
#load required survey library
require(survey)
options(survey.lonely.psu = "adjust")
#normalize the base year student sampling weight so that it sums to the sample size
ELS.data.imputed$bystuwt <- ELS.data.imputed$bystuwt/mean(ELS.data.imputed$bystuwt)
#define design object that describes the sample characteristics
#the variable psu identifies the primary sampling units (cluster ids)
#the variable STRATA_ID identifies the strata ids
#the variable bystuwt identifies the base-year sampling weights for
#respondents of the 2002 and 2004 waves (Base year and 1st follow-up)
surveyDesign <- svydesign(ids=~psu, strata=~STRAT_ID, weights=~bystuwt,
data = ELS.data.imputed, nest=T)
#get weighted percentages of treated and untreated cases
#the treatment variable is "treat" - Ever in career academy from BY Student Questionnaire
(treatmentTable <- svymean(~treat, surveyDesign)) #8.8% of cases received treatment
#-------------------------------------------------
#obtain weights for estimating the ATT for propesnity scores obtained with logistic regression
ELS.data.imputed$weightATT <- with(ELS.data.imputed,ifelse(treat==1,
1, pScores/(1-pScores)))
with(ELS.data.imputed, by(weightATT,treat,summary))
#obtain weights for estimating the ATE
ELS.data.imputed$weightATE <- with(ELS.data.imputed,ifelse(treat==1,
1/pScores, 1/(1-pScores)))
with(ELS.data.imputed, by(weightATE,treat,summary))
#======================================================
#STRATEGIES TO DEAL WITH EXTREME WEIGHTS
#check the 99th percentile weight:
with(ELS.data.imputed,quantile(weightATE, 0.99))
#check how many observatios will have truncated weights
with(ELS.data.imputed,sum(weightATE>quantile(weightATE, 0.99)))
#truncate weights at the 99th percentile:
ELS.data.imputed$weightATETruncated = with(ELS.data.imputed,
ifelse(weightATE > quantile(weightATE, 0.99),
quantile(weightATE, 0.99),weightATE))
#check truncated weights
with(ELS.data.imputed, by(weightATETruncated,treat,summary))
#obtain stabilized weights for the estimation of ATE
ELS.data.imputed$C <- with(ELS.data.imputed,ifelse(treat==1,pScores,1-pScores))
surveyDesign <- svydesign(ids=~psu, strata=~STRAT_ID, weights=~bystuwt,
data = ELS.data.imputed, nest=T) #recreate survey design
(constants <- svyby(~C, by=~treat, design=surveyDesign, FUN=svymean))
ELS.data.imputed$stabilizedWeightATE <- ifelse(ELS.data.imputed$treat==1,
constants[1,2]/ELS.data.imputed$C,
constants[2,2]/ELS.data.imputed$C)
#check stabilized weigts for extremeness
with(ELS.data.imputed, by(stabilizedWeightATE,treat,summary))
#compare the distributions of weights and stabilized weights for the ATE
tiff("Chapter3_figure3-1.tif", res=600, compression = "lzw", height=6, width=15, units="in")
with(ELS.data.imputed,
hist(weightATE[treat==1], density = 10, angle = 45, main="Propensity Score Weights for the ATE",
xlab="Shaded = IPTW | Gray = Stabilized IPTW") )
with(ELS.data.imputed,
hist(stabilizedWeightATE[treat==1], col=gray(0.4,0.25), add=T) )
dev.off()
#=====================================================
#obtain weights with propensity scores estimated with random forests
#obtain weights for estimating the ATT for propesnity scores obtained with logistic regression
ELS.data.imputed$weightATTRf <- with(ELS.data.imputed,ifelse(treat==1,
1, pScoresRf/(1-pScoresRf)))
with(ELS.data.imputed, by(weightATTRf,treat,summary))
#obtain weights for estimating the ATE
ELS.data.imputed$weightATERf <- with(ELS.data.imputed,ifelse(treat==1,
1/pScoresRf, 1/(1-pScoresRf)))
with(ELS.data.imputed, by(weightATERf,treat,summary))
#======================================================
#obtain weights with propensity scores estimated with GBM
#obtain weights for estimating the ATT for propesnity scores obtained with logistic regression
ELS.data.imputed$weightATTGBM <- with(ELS.data.imputed,ifelse(treat==1,
1, pScoresGBM/(1-pScoresGBM)))
with(ELS.data.imputed, by(weightATTGBM,treat,summary))
#obtain weights for estimating the ATE
ELS.data.imputed$weightATEGBM <- with(ELS.data.imputed,ifelse(treat==1,
1/pScoresGBM, 1/(1-pScoresGBM)))
with(ELS.data.imputed, by(weightATEGBM,treat,summary))
#==========================================================
#obtain weights for multiple imputed datasets
library(mitools)
#add propensity score mean to imputed datasets
allImputations <- update(allImputations, weightATT = ifelse(treat==1,
1, pScores/(1-pScores)))
with(allImputations, by(weightATT,treat,summary))
#======================================================
#EVALUATE COVARIATE BALANCE
#create final weights for the base year
#the final weigth is the product of the propensity score weight
#and the sampling weight
ELS.data.imputed$finalWeightBY <- with(ELS.data.imputed,bystuwt*weightATT)
#evaluate covariate balance for ATT
require(twang)
balanceTable <- bal.stat(ELS.data.imputed, vars= covariateNames,
treat.var = "treat",
w.all = ELS.data.imputed$finalWeightBY, get.ks=F,
sampw = ELS.data.imputed$bystuwt,
estimand="ATT", multinom=F)
#Table with results of balance evaluation. The columns are:
# tx.mn - The mean of the treatment group
# tx.sd - The standard deviation of the treatment group
# ct.mn - The mean of the control group
# ct.sd - The standard deviation of the control group
# std.eff.sz - The standardized effect size, (tx.mn-ct.mn)/tx.sd.
# stat - the t-statistic for numeric variables and the chi-square statistic for categorical variables
# p - the p-value for the test associated with stat
balanceTable <- balanceTable$results
#summarize the covariate balance quality
(summaryBalance <- rbind(
std.eff.sz <- summary(abs(balanceTable$std.eff.sz)), #standardized effect sizes
#==============================================================
#create a custom function to return balance summaries
#the arguments are:
#data - the name of the data.frame containing the data.
#samplingWeight - the name of the variable containing the sampling weight, in quotes
#PSWeight - the name of the variable containing the propensity score weight, in quotes
#treatment - the of the treatment indicator variable, in quotes
#the type of treatment effect, either "ATT" or "ATE"
#covariateNames - the vector of covariate names
balanceSummarizer <- function(data, samplingWeight,
PSWeight, treatment,
effect, covariateNames) {
finalWeight <- data[,samplingWeight]*data[,PSWeight]
#evaluate covariate balance for ATT
require(twang)
balance.table <- bal.stat(data, vars= covariateNames,
treat.var = treatment,
w.all = finalWeight, get.ks=F,
sampw = data[,samplingWeight],
estimand=effect, multinom=F)
balance.table <- balance.table$results
#calculate variance ratio
balance.table$varRatio <- with(balance.table, tx.sd^2/ct.sd^2)
#summarize the covariate balance quality
return(rbind(
std.eff.sz = summary(abs(balance.table$std.eff.sz)), #standardized effect sizes
varRatio = summary(balance.table$varRatio) ))#variance ratios
#close function
}
#====================================================================
#obtain summaries of covariate balances with different propensity scores
#using the balanceSummarizer function I defined above.
Table3.1 <- rbind(
logistic = balanceSummarizer(data=ELS.data.imputed,PSWeight="weightATT", effect="ATT",
samplingWeight="bystuwt", treatment="treat",covariateNames=covariateNames),
RF = balanceSummarizer(data=ELS.data.imputed,PSWeight="weightATTRf", effect="ATT",
samplingWeight="bystuwt", treatment="treat",covariateNames=covariateNames),
GBM = balanceSummarizer(data=ELS.data.imputed,PSWeight="weightATTGBM",effect="ATT",
samplingWeight="bystuwt", treatment="treat",covariateNames=covariateNames))
Table3.1 <- data.frame(Table3.1,
index=rep(c("std.eff.sz","varRatio"),3), method=rep(c("Logistic","RF","GBM"),each=2))
write.csv(Table3.1, file="Table3-1.csv")
Table3.2 <- rbind(
logistic = balanceSummarizer(data=ELS.data.imputed,PSWeight="weightATE",effect="ATE",
samplingWeight="bystuwt", treatment="treat",covariateNames=covariateNames),
truncated = balanceSummarizer(data=ELS.data.imputed,PSWeight="weightATETruncated",effect="ATE",
samplingWeight="bystuwt", treatment="treat",covariateNames=covariateNames),
stabilized = balanceSummarizer(data=ELS.data.imputed,PSWeight="stabilizedWeightATE",effect="ATE",
samplingWeight="bystuwt", treatment="treat",covariateNames=covariateNames),
RF = balanceSummarizer(data=ELS.data.imputed,PSWeight="weightATERf",effect="ATE",
samplingWeight="bystuwt", treatment="treat",covariateNames=covariateNames),
GBM = balanceSummarizer(data=ELS.data.imputed,PSWeight="weightATEGBM",effect="ATE",
samplingWeight="bystuwt", treatment="treat",covariateNames=covariateNames))
Table3.2 = data.frame(Table3.2,
index=rep(c("std.eff.sz","varRatio"),5),
method=rep(c("Logistic","Truncated","Stabilized","RF","GBM"),each=2))
#write.csv(Table3.2, file="Table3-2.csv")
#===============================================================
#create a final weights for estimating the effect of career academy participation
#on income in the second folow up (2006)
ELS.data.imputed$finalWeight2006 <- with(ELS.data.imputed,bystuwt*weightATT)
#normalize the base-year to second folow-up (2006) final weight
ELS.data.imputed$finalWeight2006 <- ELS.data.imputed$finalWeight2006/mean(ELS.data.imputed$finalWeight2006)
#check distribution of final weights
summary(ELS.data.imputed$finalWeight2006)
#===========================================================
#ESTIMATE THE ATT
#re-create the survey design including the final weight for 2006
surveyDesign2006 <- svydesign(ids=~psu, strata=~STRAT_ID, weights=~finalWeight2006,
data = ELS.data.imputed, nest=T)
#create replicate weights for bootstrapping
surveyDesign2006Boot <- as.svrepdesign(surveyDesign2006, type=c("bootstrap"),replicates=1000)
#obtain ATT as weighted mean differences.
(weightedMeans <- svyby(formula=~F2ERN5P2,by=~treat,design=surveyDesign2006Boot,
FUN=svymean,covmat=TRUE))
(ATT2006 <- svycontrast(weightedMeans, contrasts=c(-1,1)))
#obtain the group variances
(weightedVars <- svyby(formula=~F2ERN5P2,by=~treat,design=surveyDesign2006Boot,
FUN=svyvar,covmat=TRUE))
#estimate the ATT for 2006 with regression analysis for complex survey data
outcomeModel2006 <- svyglm(F2ERN5P2~treat,surveyDesign2006)
summary(outcomeModel2006)
#re-estimate the ATT with regression, but this time obtain standard errors with bootstrapping
outcomeModel2006Boot <- svyglm(F2ERN5P2~treat,surveyDesign2006Boot)
summary(outcomeModel2006Boot)
#=====================================================================
#Estimate the ATT with multiple imputed datasets
#first, add the final weight to imputed datasets
allImputations <- update(allImputations, finalWeightATT = bystuwt*weightATT)
#normalize the final weight
allImputations <- update(allImputations, finalWeightATT = finalWeightATT/mean(finalWeightATT))
#create survey design object for multiple imputed dastests
surveyDesign2006MI <- svydesign(ids=~psu, strata=~STRAT_ID, weights=~finalWeightATT,
data = allImputations, nest=T)
#estimate the ATT with regression in each imputed dataset
outcomeModel2006MI <- with(surveyDesign2006MI, svyglm(F2ERN5P2~treat))
#combine estimates from multiplle imputed datasets
resultsModel2006MI <- MIcombine(outcomeModel2006MI)
summary(resultsModel2006MI)
#=====================================================================
#Doubly-robust (DR) estimation with regression estimation of
#propensity score weighted means using the propensity score as a covariate
#define survey design for treated observations
surveyDesign2006T <- subset(surveyDesign2006Boot, treat==1)
#define survey design for untreated observations
surveyDesign2006U <- subset(surveyDesign2006Boot, treat==0)
#fit model for treated and untreated observations
modelT <- svyglm(F2ERN5P2~pScores+I(pScores^2)+I(pScores^3),surveyDesign2006T)
modelU <- svyglm(F2ERN5P2~pScores+I(pScores^2)+I(pScores^3),surveyDesign2006U)
#obtain predicted outcomes with treated observations
Yt1 <- predict(modelT, newdata=data.frame(pScores=with(ELS.data.imputed,pScores[treat==1])),
vcov=TRUE, type="response")
Yt0 <- predict(modelU, newdata=data.frame(pScores=with(ELS.data.imputed,pScores[treat==1])),
vcov=TRUE, type="response")
#obtain the difference between predicted outcomes
diff <- Yt1 - Yt0
ATT2006DR <- svycontrast(diff, contrasts=rep(1/length(diff),length(diff)))
#======================================
#perform sensitivity analysis using Carnegie, Harada and Hill (2016) method
#run the sensitivity analysis with automatic selection of sensitivity parameters
library(treatSens)
sensitivity <- treatSens(F2ERN5P2~treat+pScores+I(pScores^2)+I(pScores^3),
trt.family = binomial(link="probit"),
grid.dim = c(5,5), nsim = 20, standardize = TRUE,
verbose = TRUE, data=ELS.data.imputed,
weights=ELS.data.imputed$finalWeight2006)
#obtain summary
summary(sensitivity)
#obtain plot of sensitivity analysis
#compare the distributions of weights and stabilized weights for the ATE
tiff("Chapter3_figure3-2.tif", res=600, compression = "lzw", height=6, width=15, units="in")
sensPlot(sensitivity, col.zero = "black", lty.zero = 2,
col.insig = "black", lty.insig = 3)
dev.off()
#===============================
#save all objects
#save(list=ls(), file="results_example__career_academy.Rdata", compress=T)