##code for Chapter 7 "Propensity Score Methods for Continuous Treatment Doses" of book:
#Leite, W. L. (2017). Practical propensity score methods using R. 
#Thousand Oaks, CA: Sage. 
# 
#this is the code that was used to generate the example results in the book
#As the R software and the R packages used in this example are updated frequently
#some incompatibilities between the current code and new R versions or package versions
#may appear
#Any updates to the code will be posted at:
# http://www.practicalpropensityscore.com

#PART 1B GENERALIZED PROPENSITY SCORES AND DOSE RESPONSE FUNCTION FOR CONTINOUS TREATMENTS
#USING A GAMMA DISTRIBUTION

#this example estimates the effect of school participation 
#in the Algebra Nation virtual learning environment 
#on the mean student scores on Florida's Algebra I End-of-Course (EOC) Assessment.

#load data
data <- read.csv("Chapter7_data_Algebra_Nation.csv")

#standardize continuous covariates 
#(this is to help interpret covariate balance measures because they will be in standard units)
data$numOfStud2014 <- scale(data$numOfStud2014)
data$meanScale2012 <- scale(data$meanScale2012)
data$lev1Perc2012 <- scale(data$lev1Perc2012)
data$lev5Perc2012 <- scale(data$lev5Perc2012)
data$perc.free.lunch <- scale(data$perc.free.lunch)
data$perc.reduced.lunch <- scale(data$perc.reduced.lunch)

#convert binary variables to factors
data$SeniorHigh <- factor(data$SeniorHigh)
data$middleHigh <- factor(data$middleHigh)

#breakdown locale into size and type variables
data$locationSize <- with(data, ifelse(Locale.=="City: Large" |
     Locale.=="Suburb: Large", "Large", ifelse( Locale.=="City: Midsize"|
    Locale.=="Suburb: Midsize", "Midsize", "Small")))  
data$locationSize <- factor(data$locationSize)

data$locationRural <- with(data, ifelse(Locale.=="Rural: Distant" |
            Locale.=="Rural: Fringe" |
            Locale.=="Rural: Remote", "Rural", "Urban"))  
data$locationRural <- factor(data$locationRural)

#view treatment dose, which in this case is logins per examinee
hist(data$loginsPerExaminee)

#given that the loginsPerExaminee has outliers, trim the top 5%
data = subset(data, loginsPerExaminee<quantile(data$loginsPerExaminee,0.95))
hist(data$loginsPerExaminee)

#=====================================================
#define  covariates
covariateNames <- c("Charter", #dummy indicator of charter school
                   "Magnet.", #dummy indicator of magnet school
                   "Title.I.School.", #dummy indicator of title 1 school
                   "locationRural",  #rural location
                   "locationSize", #size of location (large, midsize, small)
                   "Students.", #total number of students in 2012
                   "SeniorHigh" ,    #dummy indicator of whether it is a senior high school (the reference group is high school                          
                   "numOfStud2014", #number of test takers in 2014 
                   "meanScale2012", #mean scaled scores in 2012
                   "lev1Perc2012",#percent achieving level 1 in 2012
                   "lev5Perc2012", #percent achieving level 5 in 2012
                   "perc.free.lunch", #percent free lunch in 2012
                   "perc.reduced.lunch") #percent reduced lunch in 2012

#define model
formulaDose <- formula("loginsPerExaminee~Charter+Magnet.+
                           Title.I.School.+locationRural+locationSize+Students.+
                        SeniorHigh+numOfStud2014+meanScale2012+lev1Perc2012+
                        lev5Perc2012+perc.free.lunch+perc.reduced.lunch")

#fit regression model for treatment doses
modelDoses <- glm(formula=formulaDose, data=data, family=Gamma(link = "inverse"))
summary(modelDoses)

#estimate generalized propensity score with density of gamma distribution
#to obtain the density, we need the shape and rate
#summary(modelDoses) reports the dispersion, and the shape is 1/dispersion
#the dispersion is the variance of the fitted values
shape = 1/var(fitted(modelDoses))
#the rate is shape/mean
rate = shape/(1/mean(fitted(modelDoses)))
data$GPS <- dgamma(data$loginsPerExaminee, 
            shape=shape, rate=rate)
#compare modeled density with histogram
with(data, plot(loginsPerExaminee, GPS))
library(lattice)
xyplot(GPS~loginsPerExaminee, data)
densityplot(~loginsPerExaminee, data)

#======================================================
#evaluate covariate balance using strata of the GPS
data$strataGPS <- with(data,cut(GPS, include.lowest = T,labels=1:5,
                                 breaks = quantile(GPS,probs=seq(0,1,0.2))))


balanceTable <- data.frame() #storage
#loop through variables
for (var in 1:length(covariateNames)) {
    
    #formula with treatment dose as a function of covariate
    balanceFormula <- paste("loginsPerExaminee~strataGPS+",covariateNames[var],sep="")
    
    #regress dose on covariate without weights
    maxEff <- max(abs(coef(glm(balanceFormula,data=data, family=Gamma(link = "inverse"))))[-(1:5)])
   
    balanceTable <- rbind(balanceTable,c(var,maxEff))
    
} #close loop

#put  variable names on table
names(balanceTable) <- c("variable","coef") 
balanceTable$variable <- covariateNames
#standardize coefficients with respect to sd of outcome
balanceTable$coef <- balanceTable$coef/sd(data$loginsPerExaminee)



#fit the outcome model
library(survey)
designAN <- svydesign(id=~1,weights=~1,data =data)


modelOutcome <- svyglm(formula="meanScale2014~loginsPerExaminee + I(loginsPerExaminee^2) + 
                                          GPS + I(GPS^2) + loginsPerExaminee:GPS", design=designAN)
summary(modelOutcome)

#Estimate the treatment effect at percentiles of GPS
all.effects <- data.frame()#storage

#loop through percentiles of dosage from 1% to 99%
for (dose in quantile(data$loginsPerExaminee,probs=seq(0.01,1,0.01)) ) {

  #predict outcome given all the GPS for a fixed value of dosage
effects <- predict(modelOutcome,type="response", vcov=T,
                  newdata=data.frame(loginsPerExaminee=dose, GPS=data$GPS)  )
effect <- svycontrast(effects,rep(1/nrow(data),nrow(data)))
all.effects <- rbind(all.effects,data.frame(effect)) #accumulate results
                } #close loop


#create a dataset of percentile of doses and responses
doseResponses <- data.frame(percentile = seq(1,100,1),
                           loginsPerExaminee=quantile(data$loginsPerExaminee,probs=seq(0.01,1,0.01)),
                           all.effects)

names(doseResponses)[3:4] <- c("meanScale2014", "SE")

#calculate confidence intervals
doseResponses$lowerCL <- with(doseResponses, meanScale2014 - 1.96*SE)
doseResponses$upperCL <- with(doseResponses, meanScale2014 + 1.96*SE)


#plot dose-response function
with(doseResponses, plot(loginsPerExaminee,meanScale2014, main="", type="b",
                         xlab="Log logins per examinee",ylab="Mean Algebra I 2014") )
with(doseResponses, lines(loginsPerExaminee,lowerCL, lty = 'dashed'))
with(doseResponses, lines(loginsPerExaminee,upperCL, lty = 'dashed'))