#code for Chapter 5 "Propensity Score Matching" of book:
#Leite, W. L. (2017). Practical propensity score methods using R. 
#Thousand Oaks, CA: Sage. 

#PART 1 - PROPENSITY SCORE ESTIMATION

# 
#this is the code that was used to generate the example results in the book
#As the R software and the R packages used in this example are updated frequently
#some incompatibilities between the current code and new R versions or package versions
#may appear
#Any updates to the code will be posted at:
# http://www.practicalpropensityscore.com


#This example estimates the effect of mothers having a job 
#that provides or subsidizes child care 
#on the length that they breastfeed their children 
#National Longitudinal Survey of Youth 1979 (NLSY79) 
#and the NLSY79 Children and Youth

#load dataset
load("Chapter5_data_breastfeeding_example.rData")


#define propensity score formula
covariateNames <- c(
    "C0005300", #RACE OF CHILD (MOTHER'S RACIAL/ETHNIC COHORT FROM SCREENER)                CRACE          
    "C0005400", #SEX OF CHILD                                                         
     "C0270300", #NUMBER OF JOBS HELD BY MOTHER IN 4TH QTR BEFORE BIRTH OF CHILD             NJOBS01               
    "C0270600", #USUAL HOURS WORKED BY MOTHER AT ALL JOBS IN 4TH QTR BEFORE BIRTH OF CHILD  NHRJBS01       
    "C0270700", #USUAL EARNINGS OF MOTHER AT ALL JOBS IN 4TH QTR BEFORE BIRTH OF CHILD      EARNJB01       
    "C0271800", # COLLECTIVE BARGAINING SET MOTHER'S MAIN JOB WAGES, 4TH QTR BEFORE BIRTH OF CHILD  UNONMN01
    "C0328000", #LENGTH OF GESTATION OF CHILD IN WEEKS                                             PST0002        
    "C0328400", #CHILD DELIVERED BY CESAREAN SECTION?                                          PST0007        
    "C0329000", # DAYS MOTHER STAYED IN HOSPITAL AT BIRTH OF CHILD                             PST0013        
    "C0329100", # DAYS CHILD STAYED IN HOSPITAL AFTER DELIVERY                                 PST0014        
    "R0000700",    #COUNTRY OF BIRTH  OF MOTHER
    "R0618300",    #PROFILES ARMED FORCES QUALIFICATION TEST (AFQT) PERCENTILE SCORE - REVISED 1989 AFQT-2 
    "C0270200", #weeks after birth that mother returned to work (restricted to 12 weeks or less)
    "motherAge", #age of mother at child's birth
    "classWorker", #CLASS OF WORKER AT CURRENT JOB/MOST RECENT JOB
"familySize" , #FAMILY SIZE
"highestGrade" , #HIGHEST GRADE COMPLETED AS OF MAY 1 SURVEY YEAR (REVISED)
"jobsEver" , #NUMBER OF DIFFERENT JOBS EVER REPORTED AS OF INTERVIEW DATE
"maritalStatus" , #MARITAL STATUS (COLLAPSED)
"residenceRegion", #REGION OF CURRENT RESIDENCE
"ruralUrban"  , #IS R'S CURRENT RESIDENCE URBAN/RURAL?
"totalWelfare",  # TOTAL AMOUNT AFDC FOOD STAMPS OR OTH WELFARE/SSI RECEIVED DURING CAL YR 
"weeksWorked", #NUMBER OF WEEKS WORKED IN PAST CALENDAR YEAR
"hoursPerWeek", #hours per week worked in the last calendar year
"maternityLeave", #FRINGE BENEFITS CURRENT JOB/MOST RECENT JOB - MATERNITY/PATERNITY LEAVE 
"flexibleSchedule", #job allows flexible schedule
"argueChores",#FREQUENCY R & HUSBAND/PARTNER ARGUE ABOUT-CHORES & RESPONSIBILITIES
"dentalInsurance", #company provides dental insurance
"lifeInsurance", #company provides life insurance
"profitSharing", #company provides profit sharing
"retirement", #company provided retirement plan
"training") #company provided training opportunities

# #add missing value indicators to propensity score model
covariateNamesNA <- paste(covariateNames,"NA",sep="")
#restrict missing indicators to covariates with at least 5% of missing data
covariateNamesNA <- covariateNamesNA[apply(data[,covariateNamesNA],2,mean)>=0.05]

#merge covariate names with missing indicator names
covariateNames <- c(covariateNames, covariateNamesNA)

#obtain the propensity score formula   
psFormula <- paste(covariateNames, collapse="+")
psFormula <- formula(paste("childCare~",psFormula, sep=""))
print(psFormula)

#estimate propensity scores with logistic regression
psModel <- glm(psFormula, data, family=binomial())
data$logitPScores <- log(fitted(psModel)/(1-fitted(psModel)))

#estimate propensity scores with random forests
library(party)
set.seed(2015)
mycontrols <- cforest_unbiased(ntree=10000, mtry=5)
mycforest <- cforest(psFormula, data=data, controls=mycontrols)


#obtain a list of predicted probabilities
predictedProbabilities <- predict(mycforest, type="prob")

#organize the list into a matrix with two columns for the 
#probability of being in treated and control groups.
#keep only the second column, which are the propensity scores.
pScores2 <- matrix(unlist(predictedProbabilities),,2,byrow=T)[,2]

#convert propensity scores to logit
data$logitPScores2 <- log(pScores/(1-pScores))



#evaluate common support with plots
tiff("Chapter5_figure5-1.tif", res=600, compression = "lzw", height=6, width=15, units="in")
hist(data$logitPScores[data$childCare==0], density = 10, angle = 45, main="Propensity Scores",
     xlab="Shaded = Untreated | Gray = Treated") 
hist(data$logitPScores[data$childCare==1], col=gray(0.4,0.25), add=T) 
dev.off()


library(lattice)
tiff("Chapter5_figure5-2.tif", res=600, compression = "lzw", height=6, width=15, units="in")
bwplot( logitPScores~childCare, data = data, 
        ylab = "Propensity Scores", auto.key = TRUE)
dev.off()

#obtain descriptive statistics
summary(data$logitPScores[data$childCare==1])
summary(data$logitPScores[data$childCare==0])

#save data and other objects
save(list=c("covariateNames","covariateNamesNA", "data","psFormula"),
     file="Chapter5_data_with_propensity_scores_and_formula.rData", compress=T)