EduMod 0: 2-Estimate model, combine results

From InterSciWiki

Jump to: navigation, search

Contents

1 Background

2 Results

Results of running program 1 and 2 should look something like this (with somewhat different results each time program 2 is run if there are missing data -- imputed probabilistically):

 bbb
               coef Fstat         ddf pvalue   VIF  (p value < .05 is statistically significant)
 (Intercept) -4.903 0.196   2188980.2  0.658    NA
 fyll         1.577 6.697    152527.3  0.010 2.159  (language phylogeny effect)
 fydd        -0.440 1.156     52327.0  0.282 2.000  (+ effect of close societies, - distance)
 settype     -0.394 3.262 291235145.9  0.071 1.709  (- effect of more permanent settlements)
 cultints     0.659 4.036   4876578.4  0.045 1.974  (+ effect of intensity of cultivation)
 roots       -2.608 5.080   3291170.3  0.024 1.324  (- effect of gathering roots)
 foodtrade    0.087 4.300   6668341.2  0.038 1.181  (+ effect of more trade for food)
 exogamy     -0.970 6.930  13212783.5  0.008 1.142  (+ endogamy effect, - exogamy)
 femsubs      0.818 6.675   4301594.7  0.010 1.362  (+ female contribution to subsistence effect)
 fish         0.581 5.621  29623758.1  0.018 1.273
   
  r2
      
  R2:final model R2:IV(distance) R2:IV(language)
       0.1799310       0.9244904       0.9663661
   
  ccc
              Fstat         df pvalue
 RESET        1.356 4376804.12  0.244        (Non-signif specification error of regression)
 F on restrs. 0.434   11241.09  0.510        (Non-signif that dropped variables coef. > 0)
 NCV          0.495 1952208.21  0.482        (Non-signif residual error heteroskedasticity)
 SWnormal     1.455 1355722.96  0.228        (Non-signif residual error departure from normality)
 lagll        1.874 3118786.56  0.171        (Non-signif autocorrelated language residuals)
 lagdd        2.505  347263.62  0.113        (Non-signif autocorrelated distance residuals)

3

Program 2 (9-22-09) current Eff and Dow publication version

#MI--estimate model with network-lagged dependent variables, combine results
rm(list=ls(all=TRUE))
#--Set path to your directory with data and program--
###setwd("d:/projects/MI")###setwd("d:/projects/MI")
setwd("C:/My Documents/MI")
options(echo=TRUE)

#--need these packages for estimation and diagnostics--
library(foreign)
library(spdep)
library(car)
library(lmtest)
library(sandwich)

#-----------------------------
#--Read in data, rearrange----
#-----------------------------

#--Read in original SCCS data---
load("SCCS.Rdata",.GlobalEnv)
#--Read in two weight matrices--
ll<-as.matrix(read.dta("langwm.dta")[,-1])
dd<-as.matrix(read.dta("dist25wm.dta")[,c(-1,-2,-189)])
#--Read in the imputed dataset---
load("impdat.Rdata",.GlobalEnv)

#--create dep.varb. you wish to use from SCCS data--
#--Here we sum variables measuring how much a society values children--
#--can replace "sum" with "max"
childvalue<-apply(SCCS[,c("v473","v474","v475","v476")],1,sum) 
#--find obs. for which dep. varb. is non-missing--
zdv<-which(!is.na(childvalue))
childvalue<-childvalue[zdv]

#--look at frequencies and quartiles for the dep. varb.--
summary(childvalue)
table(childvalue)

#--modify weight matrices---
#--set diagonal equal to zeros--
diag(ll)<-0
diag(dd)<-0
#--use only obs. where dep. varb. non-missing--
ll<-ll[zdv,zdv]
dd<-dd[zdv,zdv]
#--row standardize (rows sum to one)
ll<-ll/rowSums(ll)
dd<-dd/rowSums(dd)
#--make weight matrix object for later autocorrelation test--
wmatll<-mat2listw(as.matrix(ll))
wmatdd<-mat2listw(as.matrix(dd))

indpv<-c("femsubs","foodscarc","exogamy","ncmallow","superjh","moralgods",
"fempower","sexratio","war","himilexp","wagelabor","famsize","settype",
"localjh","money","cultints","roots","cereals","gath","hunt","fish",
"anim","pigs","milk","plow","bovines","tree","foodtrade",
"ndrymonth","ecorich","popdens","pathstress","CVrain","rain",
"temp","AP1","AP2","migr","brideprice","nuclearfam","pctFemPolyg")

#-----------------------------------------------------
#---Estimate model on each imputed dataset------------
#-----------------------------------------------------

#--number of imputed datasets--
nimp<-10

#--will append values to these empty objects--
vif<-NULL
ss<-NULL
beta<-NULL
dng<-NULL

#--loop through the imputed datasets--
for (i in 1:nimp){

#--select the ith imputed dataset--
m9<-impdat[which(impdat$.imp==i),]
#--retain only obs. for which dep. varb. is nonmissing--
m9<-m9[zdv,]

#--create spatially lagged dep. varbs.--
y<-as.matrix(childvalue)
xx<-as.matrix(m9[,indpv])
#--for instruments we use the spatial lag of our indep. varbs.--
#--First, the spatially lagged varb. for distance--
xdy<-dd%*%xx
cyd<-dd%*%y
o<-lm(cyd~xdy)
#--the fitted value is our instrumental variable--
fydd<-fitted(o)
#--keep R2 from this regression--
dr2<-summary(o)$r.squared
#--Then, the spatially lagged varb. for language--
xly<-ll%*%xx   
cyl<-ll%*%y
o<-lm(cyl~xly)
#--the fitted value is our instrumental variable--
fyll<-fitted(o)
#--keep R2 from this regression--
lr2<-summary(o)$r.squared
m9<-cbind(m9,fydd,fyll)

#--OLS estimate of unrestricted model--
xUR<-lm(childvalue~fyll+fydd+
cultints+roots+cereals+gath+plow+
hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
settype+localjh+superjh+moralgods+fempower+femsubs+
sexratio+war+himilexp+money+wagelabor+
migr+brideprice+nuclearfam+pctFemPolyg
,data=m9)

#--OLS estimate of restricted model--
xR<-lm(childvalue ~ fyll + cultints + roots + fish + 
    exogamy + settype + femsubs, data = m9)

#--corrected sigma2 and R2 for 2SLS--
qxx<-m9
qxx[,"fydd"]<-cyd
qxx[,"fyll"]<-cyl
b<-coef(xR)
incpt<-matrix(1,NROW(qxx),1)
x<-as.matrix(cbind(incpt,qxx[,names(b)[-1]]))
e<-y-x%*%as.matrix(b)
cs2<-as.numeric(t(e)%*%e/(NROW(x)-NCOL(x)))
cr2<-as.numeric(1-t(e)%*%e/sum((y-mean(y))^2))

#--collect coefficients and their variances--
ov<-summary(xR)
vif<-rbind(vif,vif(xR))
ss<-rbind(ss,diag(ov$cov*cs2))
#--collect robust coef. variances when there is heteroskedasticity--
#eb<-e^2
#x<-as.matrix(cbind(incpt,m9[,names(b)[-1]]))
#hcm<-inv(t(x)%*%x)%*%t(x)%*%diag(eb[1:length(eb)])%*%x%*%inv(t(x)%*%x)
#ss<-rbind(ss,diag(hcm))
beta<-rbind(beta,coef(xR))

#--collect some model diagnostics--
dropt<-c("cereals","gath","plow","hunt","anim",
"pigs","milk","bovines","foodscarc","ecorich",
"popdens","pathstress","ncmallow","famsize","localjh",
"superjh","moralgods","fempower","sexratio","money",
"fydd","wagelabor","war","himilexp","tree","foodtrade")


#--Ramsey RESET test--
p1<-qchisq(resettest(xR,type="fitted")$"p.value",1,lower.tail=FALSE)
#--Wald test (H0: dropped variables have coefficient equal zero)--
o<-linear.hypothesis(xUR,dropt,test="Chisq")$"Pr(>Chisq)"[2]
p2<-qchisq(o,1,lower.tail=FALSE)
#--Heteroskedasticity test (H0: homoskedastic residuals)--
p3<-ncv.test(xR)$ChiSquare
#--Shapiro-Wilke normality test (H0: residuals normal)
p4<-qchisq(shapiro.test(residuals(xR))$p.value,1,lower.tail=FALSE)
#--LaGrange Multiplier test for spatial autocorrelation: language--
o<-lm.LMtests(xR, wmatll, test=c("LMlag"))
p5<-as.numeric(o$LMlag$statistic)
#--LaGrange Multiplier test for spatial autocorrelation: distance--
o<-lm.LMtests(xR, wmatdd, test=c("LMlag"))
p6<-as.numeric(o$LMlag$statistic)
#--model R2--
p7<-cr2
dng<-rbind(dng,cbind(p1,p2,p3,p4,p5,p6,p7,dr2,lr2))

}

#--------------------------------------------
#--Rubin's formulas for combining estimates--
#--------------------------------------------

#--first find final regr. coefs. and p-values--
mnb<-apply(beta,2,mean)
vrb<-colSums((beta-t(matrix(mnb,length(mnb),10)))^2)/(nimp-1)
mnv<-apply(ss,2,mean)
vrT<-mnv+vrb*(1-nimp^(-1))
fst<-mnb^2/vrT
r<-(1+nimp^(-1))*vrb/mnv
v<-(nimp-1)*(1+r^(-1))^2
pval<-pf(fst,1,v,lower.tail=FALSE)
bbb<-data.frame(round(cbind(mnb,fst,v,pval),3))
bbb$VIF[2:NROW(bbb)]<-round(apply(vif,2,mean),3)
names(bbb)<-c("coef","Fstat","ddf","pvalue","VIF")

#--Then combine the diagnostics we collected--
dng<-data.frame(dng)
names(dng)<-c("RESET","Wald on restrs.","NCV","SWnormal","lagll","lagdd",
"R2:final model","R2:IV(distance)","R2:IV(language)")
r2<-apply(dng[,7:9],2,mean)
adng<-dng[,1:6]
mdm<-apply(adng,2,mean)
vrd<-colSums((adng-t(matrix(mdm,length(mdm),nimp)))^2)/(nimp-1)
aa<-4*mdm^2-2*vrd
aa[which(aa<0)]<-0
rd<-(1+nimp^(-1))*vrd/(2*mdm+aa^.5)
vd<-(nimp-1)*(1+rd^(-1))^2
Dm<-(mdm-(nimp-1)/(nimp+1)*rd)/(1+rd)
#-All chi-sq we collected have df=1-------
pvald<-pf(Dm,1,vd,lower.tail=FALSE)
ccc<-data.frame(round(cbind(Dm,vd,pvald),3))
names(ccc)<-c("Fstat","df","pvalue")

bbb
r2
ccc

#--write results to csv file for perusal in spreadsheet--
write.csv("==OLS model for childvalue==",file="OLSresults.csv",append=FALSE)
write.csv(bbb,file="OLSresults.csv",append=TRUE)
write.csv(r2,file="OLSresults.csv",append=TRUE)
write.csv(ccc,file="OLSresults.csv",append=TRUE)