#Part 1 - code for Chapter 4 "Propensity Score Stratification" of book:
#Leite, W. L. (2017). Practical propensity score methods using R. 
#Thousand Oaks, CA: Sage. 
# 
#this is the code that was used to generate the example results in the book
#As the R software and the R packages used in this example are updated frequently
#some incompatibilities between the current code and new R versions or package versions
#may appear
#Any updates to the code will be posted at:
# http://www.practicalpropensityscore.com


#=========================================================
#this code is for estimating the effect of schools 
#having at least one full-time security guards
#on the frequency of harsh punishment
#using propensity score stratification
#using data from the School Survey on Crime and Safety

#Part 1 estimation of propensity scores

#Read dataset
load(file="Chapter4_ssocs08.Rdata")


#Get predictors of treatment assignment
#variable C0116 (metal detectors) was excluded because 97.4% of the schools 
#do not have students pass through metal detectors and only 1 of the untreated
#school have them pass through metal detectors
covariateNames <- names(SSOCS.data)[c(5:6,15:27,29,31,33,35:37,39:54,80:85,150:155,162:164,178,
                                      197:200, 417:422)]

#convert missing value indicators into binary variables
for (var in 252:414) {
SSOCS.data[,var] <- ifelse(SSOCS.data[,var] == 0, 0, 1) }

#obtain proportions of missing data for variables of interest
missingIndicatorNames <- paste("I", covariateNames, sep="")
missingIndicatorNames <- missingIndicatorNames[-c(53:57)] #remove composite variables
propMissing = apply(SSOCS.data[,missingIndicatorNames],2,mean)
missingIndicatorNames <- missingIndicatorNames[propMissing > 0.05]

#obtain imputation flags for variables of interest
#check whether any dummy missing indicators are redundant because
#variables have missing values for the same cases
missingCorrelations <- cor(SSOCS.data[,missingIndicatorNames])
diag(missingCorrelations) <- 0
maxCorrelations <- apply(missingCorrelations,2,max) 
dummyNAnames <- names(maxCorrelations)[maxCorrelations < 0.8]
maxCorrelationsHigh <- maxCorrelations[!duplicated(maxCorrelations)]
dummyNAnames <- c(dummyNAnames,names(maxCorrelationsHigh)[maxCorrelationsHigh>=0.8])



#create propensity score estimation formula
covariateNames <- c(covariateNames,dummyNAnames)
psFormula <- paste(covariateNames, collapse="+")
psFormula <- formula(paste("treat~",psFormula, sep=""))
print(psFormula)


#=================================
#estimate propensity scores with logistic regression
library(survey)
surveyDesign <- svydesign(ids=~0,strata=~STRATA,weights=~FINALWGT, data=SSOCS.data)


psModel <- svyglm(psFormula, surveyDesign, family=quasibinomial)

#add propensity scores to dataset
SSOCS.data$pScores <- fitted(psModel)


#evaluate common support with plots
tiff("Chapter4_figure4-1.tif", res=600, compression = "lzw", height=6, width=15, units="in")
hist(SSOCS.data$pScores[SSOCS.data$treat==0], density = 10, angle = 45, main="Propensity Scores",
     xlab="Shaded = Untreated | Gray = Treated") 
hist(SSOCS.data$pScores[SSOCS.data$treat==1], col=gray(0.4,0.25), add=T) 
dev.off()

#========================================
library(lattice)
tiff("Chapter4_figure4-2.tif", res=600, compression = "lzw", height=6, width=15, units="in")
bwplot( pScores~treat, data = SSOCS.data, 
        ylab = "Propensity Scores", auto.key = TRUE)
dev.off()

#obtain descriptive statistics
summary(SSOCS.data$pScores[SSOCS.data$treat==0])
summary(SSOCS.data$pScores[SSOCS.data$treat==1])


# #=====================================================
#Estimate propensity scores with GBM
library(twang)
#change the treatment indicator from factor to numeric, which is required by GBM
set.seed(2016)
SSOCS.data$treat <- as.numeric(as.character(SSOCS.data$treat)) 
myGBM <- ps(psFormula, data = SSOCS.data, n.trees=10000, interaction.depth=4,
            shrinkage=0.01, stop.method=c("es.max"), estimand = "ATT", 
            verbose=TRUE, sampw = SSOCS.data$FINALWGT)

#obtain information about balance achieved and iteration numbers
summary(myGBM)


#obtain tiff image
tiff("Chapter4_GBM_convergence.tif", res=600, compression = "lzw", height=6, width=15, units="in")
plot(myGBM,type="b", color=F)
dev.off()

#extract estimated propensity scores from object
pScoresGBM <- myGBM$ps
names(pScoresGBM) = "pScoresGBM"
SSOCS.data$pScoresGBM <- pScoresGBM

#save dataset with propensity scores and the vector of covariate names
save(list=c("SSOCS.data", "covariateNames"), file="Chapter4_ssocs08_with_propensity_scores.Rdata")