EduMod 4: Estimate model, combine results

From InterSciWiki

Jump to: navigation, search
#MI--estimate model, combine results
rm(list=ls(all=TRUE))
#--Set path to your directory with data and program--
setwd("C:/Program Files/R/R-2.6.2/ImputeTest")
options(echo=TRUE)
#--need these packages for estimation and diagnostics--
library(foreign)
library(spdep)
library(car)
library(lmtest)
library(sandwich)
#-----------------------------
#--Read in data, rearrange----
#-----------------------------
#--Read in original SCCS data---
load("SCCS.Rdata",.GlobalEnv)
#--find obs. for which dep. varb. is non-missing--
valchild=(SCCS$v473+SCCS$v474+SCCS$v475+SCCS$v476)
zdv<-which(!is.na(valchild))
#--Read in the imputed dataset---
load("impdat.Rdata",.GlobalEnv)
#--look at summary of impdat--
summary(impdat)
#--look at histogram for the dep. varb.--
hist(impdat$valchild)
#--weight matrix for language phylogeny, remove first column---
ll<-as.matrix(read.dta("langwm.dta")[,-1])
#--set diagonal equal to zeros--
ll<-ll-ll*diag(NROW(ll))
#--use only obs. where dep. varb. non-missing--
ll<-ll[zdv,zdv]
#--row standardize (rows sum to one)
ll<-ll/rowSums(ll)
#--check to see that rows sum to one
rowSums(ll)
#--make weight matrix object for later autocorrelation test--
wmatll<-mat2listw(as.matrix(ll))
#--weight matrix for geographic distance (remove three columns)--
dd<-as.matrix(read.dta("dist25wm.dta")[,c(-1,-2,-189)])
dd<-dd-dd*diag(NROW(dd))
dd<-dd[zdv,zdv]
dd<-dd/rowSums(dd)
wmatdd<-mat2listw(as.matrix(dd))
indpv<-c("femsubs","foodscarc",
"exogamy","ncmallow","superjh","moralgods","fempower","sexratio",
"war","himilexp","wagelabor",
"famsize","settype","localjh","money",
"cultints","roots","cereals","gath","hunt","fish",
"anim","pigs","milk","plow","bovines","tree","foodtrade",
"ndrymonth","ecorich",
"popdens","pathstress","CVrain","rain","temp","AP1","AP2")


#-----------------------------------------------------
#---Estimate model on each imputed dataset------------
#-----------------------------------------------------
#--number of imputed datasets--
nimp<-10
#--will append values to these empty objects--
vif<-NULL
ss<-NULL
beta<-NULL
dng<-NULL
#--loop through the imputed datasets--
for (i in 1:nimp){
#--select the ith imputed dataset--
m9<-impdat[which(impdat$.imp==i),]
#--retain only obs. for which dep. varb. is nonmissing--
m9<-m9[zdv,]
#--create spatially lagged dep. varbs.--
y<-as.matrix(m9$valchild)
xx<-as.matrix(m9[,indpv])
#--for instruments we use the spatial lag of our indep. varbs.--
#--First, the spatially lagged varb. for distance--
xdy<-cbind(as.matrix(dd)%*%xx)
cy<-as.matrix(dd)%*%y
fydd<-fitted(lm(cy~xdy))
#--Then, the spatially lagged varb. for language--
xdy<-cbind(as.matrix(ll)%*%xx)
cy<-as.matrix(ll)%*%y
fyll<-fitted(lm(cy~xdy))
m9<-cbind(m9,fydd,fyll)
#--OLS estimate of unrestricted model--
xUR<-lm(valchild~fyll+fydd+
cultints+roots+cereals+gath+plow+
hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
settype+localjh+superjh+moralgods+fempower+femsubs+
sexratio+war+himilexp+money+wagelabor
,data=m9)
#--OLS estimate of restricted model--
xR<-lm(valchild~fyll+fydd+
cultints+roots+tree+foodtrade+exogamy+
settype+femsubs+himilexp+fish, data=m9)
#--collect coefficients and their variances--
ov<-summary(xR)
vif<-rbind(vif,vif(xR))
ss<-rbind(ss,diag(ov$cov*ov$sigma^2))
#--collect robust coef. variances when there is heteroskedasticity--
#ss<-rbind(ss,diag(hccm(xR)))
beta<-rbind(beta,coef(xR))
#--collect some model diagnostics--
dropt<-c("cereals","gath","plow","hunt","anim",
"pigs","milk","bovines","foodscarc","ecorich",
"popdens","pathstress","ncmallow","famsize","localjh",
"superjh","moralgods","fempower","sexratio","money",
"wagelabor","war")
#--model R2--
p1<-ov$r.squared
#--Ramsey RESET test--
p2<-qchisq(resettest(xR,type="fitted")$"p.value",1,lower.tail=FALSE)
#--F test that dropped variables had coefficient equal zero--
o<-linear.hypothesis(xUR,dropt,white.adjust=TRUE)$"Pr(>F)"[2]
p3<-qchisq(o,1,lower.tail=FALSE)
#--Heteroskedasticity test (H0: homoskedastic residuals)--
p4<-ncv.test(xR)$ChiSquare
#--Shapiro-Wilke normality test (H0: residuals normal)
p5<-qchisq(shapiro.test(residuals(xR))$p.value,1,lower.tail=FALSE)
#--LaGrange Multiplier test for spatial autocorrelation: language--
o<-lm.LMtests(xR, wmatll, test=c("LMlag"))
p6<-as.numeric(o$LMlag$statistic)
#--LaGrange Multiplier test for spatial autocorrelation: distance--
o<-lm.LMtests(xR, wmatdd, test=c("LMlag"))
p7<-as.numeric(o$LMlag$statistic)
dng<-rbind(dng,cbind(p1,p2,p3,p4,p5,p6,p7))
}


#--------------------------------------------
#--Rubin's formulas for combining estimates--
#--------------------------------------------
#--first find final regr. coefs. and p-values--
mnb<-apply(beta,2,mean)
vrb<-colSums((beta-t(matrix(mnb,length(mnb),10)))^2)/(nimp-1)
mnv<-apply(ss,2,mean)
vrT<-mnv+vrb*(1-nimp^(-1))
fst<-mnb^2/vrT
r<-(1+nimp^(-1))*vrb/mnv
v<-(nimp-1)*(1+r^(-1))^2
pval<-pf(fst,1,v,lower.tail=FALSE)
bbb<-data.frame(round(cbind(mnb,fst,v,pval),3))
bbb$VIF[2:NROW(bbb)]<-round(apply(vif,2,mean),3)
names(bbb)<-c("coef","Fstat","ddf","pvalue","VIF")
#--Then combine the diagnostics we collected--
dng<-data.frame(dng)
names(dng)<-c("R2","RESET","F on restrs.","NCV","SWnormal","lagll","lagdd")
r2<-mean(dng[,1])
adng<-dng[,2:NCOL(dng)]
mdm<-apply(adng,2,mean)
vrd<-colSums((adng-t(matrix(mdm,length(mdm),nimp)))^2)/(nimp-1)
aa<-4*mdm^2-2*vrd
aa[which(aa<0)]<-0
rd<-(1+nimp^(-1))*vrd/(2*mdm+aa^.5)
vd<-(nimp-1)*(1+rd^(-1))^2
Dm<-(mdm-(nimp-1)/(nimp+1)*rd)/(1+rd)
#-All chi-sq we collected have df=1-------
pvald<-pf(Dm,1,vd,lower.tail=FALSE)
ccc<-data.frame(round(cbind(Dm,vd,pvald),3))
names(ccc)<-c("Fstat","df","pvalue")
bbb
r2
ccc
#--write results to csv file for perusal in spreadsheet--
write.csv("==OLS model for valchild==",file="valchild.csv",append=FALSE)
write.csv(bbb,file="valchild.csv",append=TRUE)
write.csv(r2,file="valchild.csv",append=TRUE)
write.csv(ccc,file="valchild.csv",append=TRUE)