Estimate model, combine results

From InterSciWiki

Jump to: navigation, search

Contents

[edit] 1 Background

Return to Imputing data for Regression Analysis

[edit] 2 Results

Results of running program 1 and 2 should look something like this (with somewhat different results each time program 2 is run if there are missing data -- imputed probabilistically):

 bbb
               coef Fstat         ddf pvalue   VIF  (p value < .05 is statistically significant)
 (Intercept) -4.903 0.196   2188980.2  0.658    NA
 fyll         1.577 6.697    152527.3  0.010 2.159  (language phylogeny effect)
 fydd        -0.440 1.156     52327.0  0.282 2.000  (+ effect of close societies, - distance)
 settype     -0.394 3.262 291235145.9  0.071 1.709  (- effect of more permanent settlements)
 cultints     0.659 4.036   4876578.4  0.045 1.974  (+ effect of intensity of cultivation)
 roots       -2.608 5.080   3291170.3  0.024 1.324  (- effect of gathering roots)
 foodtrade    0.087 4.300   6668341.2  0.038 1.181  (+ effect of more trade for food)
 exogamy     -0.970 6.930  13212783.5  0.008 1.142  (+ endogamy effect, - exogamy)
 femsubs      0.818 6.675   4301594.7  0.010 1.362  (+ female contribution to subsistence effect)
 fish         0.581 5.621  29623758.1  0.018 1.273
   
  r2
      
  R2:final model R2:IV(distance) R2:IV(language)
       0.1799310       0.9244904       0.9663661
   
  ccc
              Fstat         df pvalue
 RESET        1.356 4376804.12  0.244        (Non-signif specification error of regression)
 F on restrs. 0.434   11241.09  0.510        (Non-signif that dropped variables coef. > 0)
 NCV          0.495 1952208.21  0.482        (Non-signif residual error heteroskedasticity)
 SWnormal     1.455 1355722.96  0.228        (Non-signif residual error departure from normality)
 lagll        1.874 3118786.56  0.171        (Non-signif autocorrelated language residuals)
 lagdd        2.505  347263.62  0.113        (Non-signif autocorrelated distance residuals)

[edit] 3

Anthon Eff: The R2:IV(distance) R2:IV(language) are the goodness of fit of the instrumental variables for each of the stage 1 regressions -- the regressions creating the (IV) instrumental variables for the network-lagged dependent variables. These R2's are not usually reported, but I think they are useful to know--just to see how well the instrument fits the original (endogenous) network-lagged dependent variable.

Two things are done with regression models: 1) test hypotheses by performing inferences on the estimated parameters; 2) produce fitted values (as is done in a forecast). The regression in which the instrumental variable (IV) is created falls into the second category: we seek only the fitted value, and we have no intention of performing inferences on the estimated coefficients. The model in fact is not specified in a way that we can trust our inferences—it is atheoretical, where we throw a bunch of exogenous independent variables together just to get a fitted value of Wy that is both a good fit and is exogenous. For this reason, no one reports the estimated coefficients —- to do so would simply clutter up a paper with numbers that don’t really mean anything.

On the other hand, it makes sense to have some indication of the goodness of fit of the instrumental variable (IV) to the original Wy. Therefore I added to Program 2 code that will calculate the R^2 of the IV regressions.

[edit] Program 2 (9-19-2009) obsolete

#September 17, 2009, Anthon Eff
#MI--estimate model, combine results

rm(list=ls(all=TRUE))
#--Set path to your directory with data and program--
#setwd("d:/projects/MI")
setwd("c:/My Documents/MI")
options(echo=TRUE)

#--need these packages for estimation and diagnostics--
library(foreign)
library(spdep)
library(car)
library(lmtest)
library(sandwich)

#-----------------------------
#--Read in data, rearrange----
#-----------------------------

#--Read in original SCCS data---
load(url("http://frank.mtsu.edu/~eaeff/downloads/SCCS.Rdata"),.GlobalEnv)
#--Read in two weight matrices--
###ll<-as.matrix(read.dta("http://frank.mtsu.edu/~eaeff/downloads/langwm.dta")[,-1])
###changed to 
ll<-as.matrix(read.dta("langwm.dta")[,-1])
###dd<-as.matrix(read.dta("http://frank.mtsu.edu/~eaeff/downloads/dist25wm.dta")[,c(-1,-2,-189)])
###changed to 
dd<-as.matrix(read.dta("dist25wm.dta")[,c(-1,-2,-189)])
#--Read in the imputed dataset---
load(url("http://frank.mtsu.edu/~eaeff/downloads/impdat.Rdata"),.GlobalEnv)

#--create dep.varb. you wish to use from SCCS data--
#--Here we sum variables measuring how much a society values children--
depvar<-apply(SCCS[,c("v473","v474","v475","v476")],1,sum) #can replace "sum" with
"max"
#--find obs. for which dep. varb. is non-missing--
zdv<-which(!is.na(depvar))
depvar<-depvar[zdv]

#--look at histogram and frequencies for the dep. varb.--
hist(depvar)
table(depvar)

#--modify weight matrices---
#--set diagonal equal to zeros--
diag(ll)<-0
diag(dd)<-0
#--use only obs. where dep. varb. non-missing--
ll<-ll[zdv,zdv]
dd<-dd[zdv,zdv]
#--row standardize (rows sum to one)
ll<-ll/rowSums(ll)
dd<-dd/rowSums(dd)
#--check to see that rows sum to one
rowSums(ll)
rowSums(dd)
#--make weight matrix object for later autocorrelation test--
wmatll<-mat2listw(as.matrix(ll))
wmatdd<-mat2listw(as.matrix(dd))


indpv<-c("femsubs","foodscarc",
"exogamy","ncmallow","superjh","moralgods","fempower","sexratio",
"war","himilexp","wagelabor",
"famsize","settype","localjh","money",
"cultints","roots","cereals","gath","hunt","fish",
"anim","pigs","milk","plow","bovines","tree","foodtrade",
"ndrymonth","ecorich",
"popdens","pathstress","CVrain","rain","temp","AP1","AP2")


#-----------------------------------------------------
#---Estimate model on each imputed dataset------------
#-----------------------------------------------------

#--number of imputed datasets--
nimp<-10

#--will append values to these empty objects--
vif<-NULL
ss<-NULL
beta<-NULL
dng<-NULL

#--loop through the imputed datasets--
for (i in 1:nimp){

#--select the ith imputed dataset--
m9<-impdat[which(impdat$.imp==i),]
#--retain only obs. for which dep. varb. is nonmissing--
m9<-m9[zdv,]

#--create spatially lagged dep. varbs.--
y<-as.matrix(depvar)
xx<-as.matrix(m9[,indpv])
#--for instruments we use the spatial lag of our indep. varbs.--
#--First, the spatially lagged varb. for distance--
xdy<-dd%*%xx
cy<-dd%*%y
o<-lm(cy~xdy)
#--the fitted value is our instrumental variable--
fydd<-fitted(o)
#--keep R2 from this regression--
dr2<-summary(o)$r.squared
#--Then, the spatially lagged varb. for language--
xdy<-ll%*%xx   
cy<-ll%*%y
o<-lm(cy~xdy)
#--the fitted value is our instrumental variable--
fyll<-fitted(o)
#--keep R2 from this regression--
lr2<-summary(o)$r.squared
m9<-cbind(m9,fydd,fyll)

#--OLS estimate of unrestricted model--
xUR<-lm(depvar~fyll+fydd+
cultints+roots+cereals+gath+plow+
hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
settype+localjh+superjh+moralgods+fempower+femsubs+
sexratio+war+himilexp+money+wagelabor
,data=m9)

#--OLS estimate of restricted model--
xR<-lm(depvar~fyll+fydd+settype+cultints+
roots+foodtrade+exogamy+
femsubs+fish, data=m9)

#--collect coefficients and their variances--
ov<-summary(xR)
vif<-rbind(vif,vif(xR))
ss<-rbind(ss,diag(ov$cov*ov$sigma^2))
#--collect robust coef. variances when there is heteroskedasticity--
#ss<-rbind(ss,diag(hccm(xR)))
beta<-rbind(beta,coef(xR))

#--collect some model diagnostics--
dropt<-c("cereals","gath","plow","hunt","anim",
"pigs","milk","bovines","foodscarc","ecorich",
"popdens","pathstress","ncmallow","famsize","localjh",
"superjh","moralgods","fempower","sexratio","money",
"wagelabor","war","himilexp","tree")

#--Ramsey RESET test--
p1<-qchisq(resettest(xR,type="fitted")$"p.value",1,lower.tail=FALSE)
#--F test that dropped variables had coefficient equal zero--
o<-linear.hypothesis(xUR,dropt,white.adjust=TRUE)$"Pr(>F)"[2]
p2<-qchisq(o,1,lower.tail=FALSE)
#--Heteroskedasticity test (H0: homoskedastic residuals)--
p3<-ncv.test(xR)$ChiSquare
#--Shapiro-Wilke normality test (H0: residuals normal)
p4<-qchisq(shapiro.test(residuals(xR))$p.value,1,lower.tail=FALSE)
#--LaGrange Multiplier test for spatial autocorrelation: language--
o<-lm.LMtests(xR, wmatll, test=c("LMlag"))
p5<-as.numeric(o$LMlag$statistic)
#--LaGrange Multiplier test for spatial autocorrelation: distance--
o<-lm.LMtests(xR, wmatdd, test=c("LMlag"))
p6<-as.numeric(o$LMlag$statistic)
#--model R2--
p7<-ov$r.squared
dng<-rbind(dng,cbind(p1,p2,p3,p4,p5,p6,p7,dr2,lr2))

}


#--------------------------------------------
#--Rubin's formulas for combining estimates--
#--------------------------------------------

#--first find final regr. coefs. and p-values--
mnb<-apply(beta,2,mean)
vrb<-colSums((beta-t(matrix(mnb,length(mnb),10)))^2)/(nimp-1)
mnv<-apply(ss,2,mean)
vrT<-mnv+vrb*(1-nimp^(-1))
fst<-mnb^2/vrT
r<-(1+nimp^(-1))*vrb/mnv
v<-(nimp-1)*(1+r^(-1))^2
pval<-pf(fst,1,v,lower.tail=FALSE)
bbb<-data.frame(round(cbind(mnb,fst,v,pval),3))
bbb$VIF[2:NROW(bbb)]<-round(apply(vif,2,mean),3)
names(bbb)<-c("coef","Fstat","ddf","pvalue","VIF")

#--Then combine the diagnostics we collected--
dng<-data.frame(dng)
names(dng)<-c("RESET","F on restrs.","NCV","SWnormal","lagll","lagdd",
"R2:final model","R2:IV(distance)","R2:IV(language)")
r2<-apply(dng[,7:9],2,mean)
adng<-dng[,1:6]
mdm<-apply(adng,2,mean)
vrd<-colSums((adng-t(matrix(mdm,length(mdm),nimp)))^2)/(nimp-1)
aa<-4*mdm^2-2*vrd
aa[which(aa<0)]<-0
rd<-(1+nimp^(-1))*vrd/(2*mdm+aa^.5)
vd<-(nimp-1)*(1+rd^(-1))^2
Dm<-(mdm-(nimp-1)/(nimp+1)*rd)/(1+rd)
#-All chi-sq we collected have df=1-------
pvald<-pf(Dm,1,vd,lower.tail=FALSE)
ccc<-data.frame(round(cbind(Dm,vd,pvald),3))
names(ccc)<-c("Fstat","df","pvalue")

bbb
r2
ccc

#--write results to csv file for perusal in spreadsheet--
write.csv("==OLS model for depvar==",file="OLSresults.csv",append=FALSE)
write.csv(bbb,file="OLSresults.csv",append=TRUE)
write.csv(r2,file="OLSresults.csv",append=TRUE)
write.csv(ccc,file="OLSresults.csv",append=TRUE)

[edit] Program 2 (9-22-2009) current

#MI--estimate model with network-lagged dependent variables, combine results
rm(list=ls(all=TRUE))
#--Set path to your directory with data and program--
###setwd("d:/projects/MI")###setwd("d:/projects/MI")
setwd("C:/My Documents/MI")
options(echo=TRUE)

#--need these packages for estimation and diagnostics--
library(foreign)
library(spdep)
library(car)
library(lmtest)
library(sandwich)

#-----------------------------
#--Read in data, rearrange----
#-----------------------------

#--Read in original SCCS data---
load("SCCS.Rdata",.GlobalEnv)
#--Read in two weight matrices--
ll<-as.matrix(read.dta("langwm.dta")[,-1])
dd<-as.matrix(read.dta("dist25wm.dta")[,c(-1,-2,-189)])
#--Read in the imputed dataset---
load("impdat.Rdata",.GlobalEnv)

#--create dep.varb. you wish to use from SCCS data--
#--Here we sum variables measuring how much a society values children--
#--can replace "sum" with "max"
childvalue<-apply(SCCS[,c("v473","v474","v475","v476")],1,sum) 
#--find obs. for which dep. varb. is non-missing--
zdv<-which(!is.na(childvalue))
childvalue<-childvalue[zdv]

#--look at frequencies and quartiles for the dep. varb.--
summary(childvalue)
table(childvalue)

#--modify weight matrices---
#--set diagonal equal to zeros--
diag(ll)<-0
diag(dd)<-0
#--use only obs. where dep. varb. non-missing--
ll<-ll[zdv,zdv]
dd<-dd[zdv,zdv]
#--row standardize (rows sum to one)
ll<-ll/rowSums(ll)
dd<-dd/rowSums(dd)
#--make weight matrix object for later autocorrelation test--
wmatll<-mat2listw(as.matrix(ll))
wmatdd<-mat2listw(as.matrix(dd))

indpv<-c("femsubs","foodscarc","exogamy","ncmallow","superjh","moralgods",
"fempower","sexratio","war","himilexp","wagelabor","famsize","settype",
"localjh","money","cultints","roots","cereals","gath","hunt","fish",
"anim","pigs","milk","plow","bovines","tree","foodtrade",
"ndrymonth","ecorich","popdens","pathstress","CVrain","rain",
"temp","AP1","AP2","migr","brideprice","nuclearfam","pctFemPolyg")

#-----------------------------------------------------
#---Estimate model on each imputed dataset------------
#-----------------------------------------------------

#--number of imputed datasets--
nimp<-10

#--will append values to these empty objects--
vif<-NULL
ss<-NULL
beta<-NULL
dng<-NULL

#--loop through the imputed datasets--
for (i in 1:nimp){

#--select the ith imputed dataset--
m9<-impdat[which(impdat$.imp==i),]
#--retain only obs. for which dep. varb. is nonmissing--
m9<-m9[zdv,]

#--create spatially lagged dep. varbs.--
y<-as.matrix(childvalue)
xx<-as.matrix(m9[,indpv])
#--for instruments we use the spatial lag of our indep. varbs.--
#--First, the spatially lagged varb. for distance--
xdy<-dd%*%xx
cyd<-dd%*%y
o<-lm(cyd~xdy)
#--the fitted value is our instrumental variable--
fydd<-fitted(o)
#--keep R2 from this regression--
dr2<-summary(o)$r.squared
#--Then, the spatially lagged varb. for language--
xly<-ll%*%xx   
cyl<-ll%*%y
o<-lm(cyl~xly)
#--the fitted value is our instrumental variable--
fyll<-fitted(o)
#--keep R2 from this regression--
lr2<-summary(o)$r.squared
m9<-cbind(m9,fydd,fyll)

#--OLS estimate of unrestricted model--
xUR<-lm(childvalue~fyll+fydd+
cultints+roots+cereals+gath+plow+
hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
settype+localjh+superjh+moralgods+fempower+femsubs+
sexratio+war+himilexp+money+wagelabor+
migr+brideprice+nuclearfam+pctFemPolyg
,data=m9)

#--OLS estimate of restricted model--
xR<-lm(childvalue ~ fyll + cultints + roots + fish + 
    exogamy + settype + femsubs, data = m9)

#--corrected sigma2 and R2 for 2SLS--
qxx<-m9
qxx[,"fydd"]<-cyd
qxx[,"fyll"]<-cyl
b<-coef(xR)
incpt<-matrix(1,NROW(qxx),1)
x<-as.matrix(cbind(incpt,qxx[,names(b)[-1]]))
e<-y-x%*%as.matrix(b)
cs2<-as.numeric(t(e)%*%e/(NROW(x)-NCOL(x)))
cr2<-as.numeric(1-t(e)%*%e/sum((y-mean(y))^2))

#--collect coefficients and their variances--
ov<-summary(xR)
vif<-rbind(vif,vif(xR))
ss<-rbind(ss,diag(ov$cov*cs2))
#--collect robust coef. variances when there is heteroskedasticity--
#eb<-e^2
#x<-as.matrix(cbind(incpt,m9[,names(b)[-1]]))
#hcm<-inv(t(x)%*%x)%*%t(x)%*%diag(eb[1:length(eb)])%*%x%*%inv(t(x)%*%x)
#ss<-rbind(ss,diag(hcm))
beta<-rbind(beta,coef(xR))

#--collect some model diagnostics--
dropt<-c("cereals","gath","plow","hunt","anim",
"pigs","milk","bovines","foodscarc","ecorich",
"popdens","pathstress","ncmallow","famsize","localjh",
"superjh","moralgods","fempower","sexratio","money",
"fydd","wagelabor","war","himilexp","tree","foodtrade")


#--Ramsey RESET test--
p1<-qchisq(resettest(xR,type="fitted")$"p.value",1,lower.tail=FALSE)
#--Wald test (H0: dropped variables have coefficient equal zero)--
o<-linear.hypothesis(xUR,dropt,test="Chisq")$"Pr(>Chisq)"[2]
p2<-qchisq(o,1,lower.tail=FALSE)
#--Heteroskedasticity test (H0: homoskedastic residuals)--
p3<-ncv.test(xR)$ChiSquare
#--Shapiro-Wilke normality test (H0: residuals normal)
p4<-qchisq(shapiro.test(residuals(xR))$p.value,1,lower.tail=FALSE)
#--LaGrange Multiplier test for spatial autocorrelation: language--
o<-lm.LMtests(xR, wmatll, test=c("LMlag"))
p5<-as.numeric(o$LMlag$statistic)
#--LaGrange Multiplier test for spatial autocorrelation: distance--
o<-lm.LMtests(xR, wmatdd, test=c("LMlag"))
p6<-as.numeric(o$LMlag$statistic)
#--model R2--
p7<-cr2
dng<-rbind(dng,cbind(p1,p2,p3,p4,p5,p6,p7,dr2,lr2))

}

#--------------------------------------------
#--Rubin's formulas for combining estimates--
#--------------------------------------------

#--first find final regr. coefs. and p-values--
mnb<-apply(beta,2,mean)
vrb<-colSums((beta-t(matrix(mnb,length(mnb),10)))^2)/(nimp-1)
mnv<-apply(ss,2,mean)
vrT<-mnv+vrb*(1-nimp^(-1))
fst<-mnb^2/vrT
r<-(1+nimp^(-1))*vrb/mnv
v<-(nimp-1)*(1+r^(-1))^2
pval<-pf(fst,1,v,lower.tail=FALSE)
bbb<-data.frame(round(cbind(mnb,fst,v,pval),3))
bbb$VIF[2:NROW(bbb)]<-round(apply(vif,2,mean),3)
names(bbb)<-c("coef","Fstat","ddf","pvalue","VIF")

#--Then combine the diagnostics we collected--
dng<-data.frame(dng)
names(dng)<-c("RESET","Wald on restrs.","NCV","SWnormal","lagll","lagdd",
"R2:final model","R2:IV(distance)","R2:IV(language)")
r2<-apply(dng[,7:9],2,mean)
adng<-dng[,1:6]
mdm<-apply(adng,2,mean)
vrd<-colSums((adng-t(matrix(mdm,length(mdm),nimp)))^2)/(nimp-1)
aa<-4*mdm^2-2*vrd
aa[which(aa<0)]<-0
rd<-(1+nimp^(-1))*vrd/(2*mdm+aa^.5)
vd<-(nimp-1)*(1+rd^(-1))^2
Dm<-(mdm-(nimp-1)/(nimp+1)*rd)/(1+rd)
#-All chi-sq we collected have df=1-------
pvald<-pf(Dm,1,vd,lower.tail=FALSE)
ccc<-data.frame(round(cbind(Dm,vd,pvald),3))
names(ccc)<-c("Fstat","df","pvalue")

bbb
r2
ccc

#--write results to csv file for perusal in spreadsheet--
write.csv("==OLS model for childvalue==",file="OLSresults.csv",append=FALSE)
write.csv(bbb,file="OLSresults.csv",append=TRUE)
write.csv(r2,file="OLSresults.csv",append=TRUE)
write.csv(ccc,file="OLSresults.csv",append=TRUE)