Estimate model, combine results
From InterSciWiki
Contents |
[edit] 1 Background
Return to Imputing data for Regression Analysis
[edit] 2 Results
Results of running program 1 and 2 should look something like this (with somewhat different results each time program 2 is run if there are missing data -- imputed probabilistically):
bbb
coef Fstat ddf pvalue VIF (p value < .05 is statistically significant)
(Intercept) -4.903 0.196 2188980.2 0.658 NA
fyll 1.577 6.697 152527.3 0.010 2.159 (language phylogeny effect)
fydd -0.440 1.156 52327.0 0.282 2.000 (+ effect of close societies, - distance)
settype -0.394 3.262 291235145.9 0.071 1.709 (- effect of more permanent settlements)
cultints 0.659 4.036 4876578.4 0.045 1.974 (+ effect of intensity of cultivation)
roots -2.608 5.080 3291170.3 0.024 1.324 (- effect of gathering roots)
foodtrade 0.087 4.300 6668341.2 0.038 1.181 (+ effect of more trade for food)
exogamy -0.970 6.930 13212783.5 0.008 1.142 (+ endogamy effect, - exogamy)
femsubs 0.818 6.675 4301594.7 0.010 1.362 (+ female contribution to subsistence effect)
fish 0.581 5.621 29623758.1 0.018 1.273
r2
R2:final model R2:IV(distance) R2:IV(language)
0.1799310 0.9244904 0.9663661
ccc
Fstat df pvalue
RESET 1.356 4376804.12 0.244 (Non-signif specification error of regression)
F on restrs. 0.434 11241.09 0.510 (Non-signif that dropped variables coef. > 0)
NCV 0.495 1952208.21 0.482 (Non-signif residual error heteroskedasticity)
SWnormal 1.455 1355722.96 0.228 (Non-signif residual error departure from normality)
lagll 1.874 3118786.56 0.171 (Non-signif autocorrelated language residuals)
lagdd 2.505 347263.62 0.113 (Non-signif autocorrelated distance residuals)
[edit] 3
Anthon Eff: The R2:IV(distance) R2:IV(language) are the goodness of fit of the instrumental variables for each of the stage 1 regressions -- the regressions creating the (IV) instrumental variables for the network-lagged dependent variables. These R2's are not usually reported, but I think they are useful to know--just to see how well the instrument fits the original (endogenous) network-lagged dependent variable.
Two things are done with regression models: 1) test hypotheses by performing inferences on the estimated parameters; 2) produce fitted values (as is done in a forecast). The regression in which the instrumental variable (IV) is created falls into the second category: we seek only the fitted value, and we have no intention of performing inferences on the estimated coefficients. The model in fact is not specified in a way that we can trust our inferences—it is atheoretical, where we throw a bunch of exogenous independent variables together just to get a fitted value of Wy that is both a good fit and is exogenous. For this reason, no one reports the estimated coefficients —- to do so would simply clutter up a paper with numbers that don’t really mean anything.
On the other hand, it makes sense to have some indication of the goodness of fit of the instrumental variable (IV) to the original Wy. Therefore I added to Program 2 code that will calculate the R^2 of the IV regressions.
[edit] Program 2 (9-19-2009) obsolete
#September 17, 2009, Anthon Eff #MI--estimate model, combine results rm(list=ls(all=TRUE)) #--Set path to your directory with data and program-- #setwd("d:/projects/MI") setwd("c:/My Documents/MI") options(echo=TRUE) #--need these packages for estimation and diagnostics-- library(foreign) library(spdep) library(car) library(lmtest) library(sandwich) #----------------------------- #--Read in data, rearrange---- #----------------------------- #--Read in original SCCS data--- load(url("http://frank.mtsu.edu/~eaeff/downloads/SCCS.Rdata"),.GlobalEnv) #--Read in two weight matrices-- ###ll<-as.matrix(read.dta("http://frank.mtsu.edu/~eaeff/downloads/langwm.dta")[,-1]) ###changed to ll<-as.matrix(read.dta("langwm.dta")[,-1]) ###dd<-as.matrix(read.dta("http://frank.mtsu.edu/~eaeff/downloads/dist25wm.dta")[,c(-1,-2,-189)]) ###changed to dd<-as.matrix(read.dta("dist25wm.dta")[,c(-1,-2,-189)]) #--Read in the imputed dataset--- load(url("http://frank.mtsu.edu/~eaeff/downloads/impdat.Rdata"),.GlobalEnv) #--create dep.varb. you wish to use from SCCS data-- #--Here we sum variables measuring how much a society values children-- depvar<-apply(SCCS[,c("v473","v474","v475","v476")],1,sum) #can replace "sum" with "max" #--find obs. for which dep. varb. is non-missing-- zdv<-which(!is.na(depvar)) depvar<-depvar[zdv] #--look at histogram and frequencies for the dep. varb.-- hist(depvar) table(depvar) #--modify weight matrices--- #--set diagonal equal to zeros-- diag(ll)<-0 diag(dd)<-0 #--use only obs. where dep. varb. non-missing-- ll<-ll[zdv,zdv] dd<-dd[zdv,zdv] #--row standardize (rows sum to one) ll<-ll/rowSums(ll) dd<-dd/rowSums(dd) #--check to see that rows sum to one rowSums(ll) rowSums(dd) #--make weight matrix object for later autocorrelation test-- wmatll<-mat2listw(as.matrix(ll)) wmatdd<-mat2listw(as.matrix(dd)) indpv<-c("femsubs","foodscarc", "exogamy","ncmallow","superjh","moralgods","fempower","sexratio", "war","himilexp","wagelabor", "famsize","settype","localjh","money", "cultints","roots","cereals","gath","hunt","fish", "anim","pigs","milk","plow","bovines","tree","foodtrade", "ndrymonth","ecorich", "popdens","pathstress","CVrain","rain","temp","AP1","AP2") #----------------------------------------------------- #---Estimate model on each imputed dataset------------ #----------------------------------------------------- #--number of imputed datasets-- nimp<-10 #--will append values to these empty objects-- vif<-NULL ss<-NULL beta<-NULL dng<-NULL #--loop through the imputed datasets-- for (i in 1:nimp){ #--select the ith imputed dataset-- m9<-impdat[which(impdat$.imp==i),] #--retain only obs. for which dep. varb. is nonmissing-- m9<-m9[zdv,] #--create spatially lagged dep. varbs.-- y<-as.matrix(depvar) xx<-as.matrix(m9[,indpv]) #--for instruments we use the spatial lag of our indep. varbs.-- #--First, the spatially lagged varb. for distance-- xdy<-dd%*%xx cy<-dd%*%y o<-lm(cy~xdy) #--the fitted value is our instrumental variable-- fydd<-fitted(o) #--keep R2 from this regression-- dr2<-summary(o)$r.squared #--Then, the spatially lagged varb. for language-- xdy<-ll%*%xx cy<-ll%*%y o<-lm(cy~xdy) #--the fitted value is our instrumental variable-- fyll<-fitted(o) #--keep R2 from this regression-- lr2<-summary(o)$r.squared m9<-cbind(m9,fydd,fyll) #--OLS estimate of unrestricted model-- xUR<-lm(depvar~fyll+fydd+ cultints+roots+cereals+gath+plow+ hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+ ecorich+popdens+pathstress+exogamy+ncmallow+famsize+ settype+localjh+superjh+moralgods+fempower+femsubs+ sexratio+war+himilexp+money+wagelabor ,data=m9) #--OLS estimate of restricted model-- xR<-lm(depvar~fyll+fydd+settype+cultints+ roots+foodtrade+exogamy+ femsubs+fish, data=m9) #--collect coefficients and their variances-- ov<-summary(xR) vif<-rbind(vif,vif(xR)) ss<-rbind(ss,diag(ov$cov*ov$sigma^2)) #--collect robust coef. variances when there is heteroskedasticity-- #ss<-rbind(ss,diag(hccm(xR))) beta<-rbind(beta,coef(xR)) #--collect some model diagnostics-- dropt<-c("cereals","gath","plow","hunt","anim", "pigs","milk","bovines","foodscarc","ecorich", "popdens","pathstress","ncmallow","famsize","localjh", "superjh","moralgods","fempower","sexratio","money", "wagelabor","war","himilexp","tree") #--Ramsey RESET test-- p1<-qchisq(resettest(xR,type="fitted")$"p.value",1,lower.tail=FALSE) #--F test that dropped variables had coefficient equal zero-- o<-linear.hypothesis(xUR,dropt,white.adjust=TRUE)$"Pr(>F)"[2] p2<-qchisq(o,1,lower.tail=FALSE) #--Heteroskedasticity test (H0: homoskedastic residuals)-- p3<-ncv.test(xR)$ChiSquare #--Shapiro-Wilke normality test (H0: residuals normal) p4<-qchisq(shapiro.test(residuals(xR))$p.value,1,lower.tail=FALSE) #--LaGrange Multiplier test for spatial autocorrelation: language-- o<-lm.LMtests(xR, wmatll, test=c("LMlag")) p5<-as.numeric(o$LMlag$statistic) #--LaGrange Multiplier test for spatial autocorrelation: distance-- o<-lm.LMtests(xR, wmatdd, test=c("LMlag")) p6<-as.numeric(o$LMlag$statistic) #--model R2-- p7<-ov$r.squared dng<-rbind(dng,cbind(p1,p2,p3,p4,p5,p6,p7,dr2,lr2)) } #-------------------------------------------- #--Rubin's formulas for combining estimates-- #-------------------------------------------- #--first find final regr. coefs. and p-values-- mnb<-apply(beta,2,mean) vrb<-colSums((beta-t(matrix(mnb,length(mnb),10)))^2)/(nimp-1) mnv<-apply(ss,2,mean) vrT<-mnv+vrb*(1-nimp^(-1)) fst<-mnb^2/vrT r<-(1+nimp^(-1))*vrb/mnv v<-(nimp-1)*(1+r^(-1))^2 pval<-pf(fst,1,v,lower.tail=FALSE) bbb<-data.frame(round(cbind(mnb,fst,v,pval),3)) bbb$VIF[2:NROW(bbb)]<-round(apply(vif,2,mean),3) names(bbb)<-c("coef","Fstat","ddf","pvalue","VIF") #--Then combine the diagnostics we collected-- dng<-data.frame(dng) names(dng)<-c("RESET","F on restrs.","NCV","SWnormal","lagll","lagdd", "R2:final model","R2:IV(distance)","R2:IV(language)") r2<-apply(dng[,7:9],2,mean) adng<-dng[,1:6] mdm<-apply(adng,2,mean) vrd<-colSums((adng-t(matrix(mdm,length(mdm),nimp)))^2)/(nimp-1) aa<-4*mdm^2-2*vrd aa[which(aa<0)]<-0 rd<-(1+nimp^(-1))*vrd/(2*mdm+aa^.5) vd<-(nimp-1)*(1+rd^(-1))^2 Dm<-(mdm-(nimp-1)/(nimp+1)*rd)/(1+rd) #-All chi-sq we collected have df=1------- pvald<-pf(Dm,1,vd,lower.tail=FALSE) ccc<-data.frame(round(cbind(Dm,vd,pvald),3)) names(ccc)<-c("Fstat","df","pvalue") bbb r2 ccc #--write results to csv file for perusal in spreadsheet-- write.csv("==OLS model for depvar==",file="OLSresults.csv",append=FALSE) write.csv(bbb,file="OLSresults.csv",append=TRUE) write.csv(r2,file="OLSresults.csv",append=TRUE) write.csv(ccc,file="OLSresults.csv",append=TRUE)
[edit] Program 2 (9-22-2009) current
#MI--estimate model with network-lagged dependent variables, combine results
rm(list=ls(all=TRUE))
#--Set path to your directory with data and program--
###setwd("d:/projects/MI")###setwd("d:/projects/MI")
setwd("C:/My Documents/MI")
options(echo=TRUE)
#--need these packages for estimation and diagnostics--
library(foreign)
library(spdep)
library(car)
library(lmtest)
library(sandwich)
#-----------------------------
#--Read in data, rearrange----
#-----------------------------
#--Read in original SCCS data---
load("SCCS.Rdata",.GlobalEnv)
#--Read in two weight matrices--
ll<-as.matrix(read.dta("langwm.dta")[,-1])
dd<-as.matrix(read.dta("dist25wm.dta")[,c(-1,-2,-189)])
#--Read in the imputed dataset---
load("impdat.Rdata",.GlobalEnv)
#--create dep.varb. you wish to use from SCCS data--
#--Here we sum variables measuring how much a society values children--
#--can replace "sum" with "max"
childvalue<-apply(SCCS[,c("v473","v474","v475","v476")],1,sum)
#--find obs. for which dep. varb. is non-missing--
zdv<-which(!is.na(childvalue))
childvalue<-childvalue[zdv]
#--look at frequencies and quartiles for the dep. varb.--
summary(childvalue)
table(childvalue)
#--modify weight matrices---
#--set diagonal equal to zeros--
diag(ll)<-0
diag(dd)<-0
#--use only obs. where dep. varb. non-missing--
ll<-ll[zdv,zdv]
dd<-dd[zdv,zdv]
#--row standardize (rows sum to one)
ll<-ll/rowSums(ll)
dd<-dd/rowSums(dd)
#--make weight matrix object for later autocorrelation test--
wmatll<-mat2listw(as.matrix(ll))
wmatdd<-mat2listw(as.matrix(dd))
indpv<-c("femsubs","foodscarc","exogamy","ncmallow","superjh","moralgods",
"fempower","sexratio","war","himilexp","wagelabor","famsize","settype",
"localjh","money","cultints","roots","cereals","gath","hunt","fish",
"anim","pigs","milk","plow","bovines","tree","foodtrade",
"ndrymonth","ecorich","popdens","pathstress","CVrain","rain",
"temp","AP1","AP2","migr","brideprice","nuclearfam","pctFemPolyg")
#-----------------------------------------------------
#---Estimate model on each imputed dataset------------
#-----------------------------------------------------
#--number of imputed datasets--
nimp<-10
#--will append values to these empty objects--
vif<-NULL
ss<-NULL
beta<-NULL
dng<-NULL
#--loop through the imputed datasets--
for (i in 1:nimp){
#--select the ith imputed dataset--
m9<-impdat[which(impdat$.imp==i),]
#--retain only obs. for which dep. varb. is nonmissing--
m9<-m9[zdv,]
#--create spatially lagged dep. varbs.--
y<-as.matrix(childvalue)
xx<-as.matrix(m9[,indpv])
#--for instruments we use the spatial lag of our indep. varbs.--
#--First, the spatially lagged varb. for distance--
xdy<-dd%*%xx
cyd<-dd%*%y
o<-lm(cyd~xdy)
#--the fitted value is our instrumental variable--
fydd<-fitted(o)
#--keep R2 from this regression--
dr2<-summary(o)$r.squared
#--Then, the spatially lagged varb. for language--
xly<-ll%*%xx
cyl<-ll%*%y
o<-lm(cyl~xly)
#--the fitted value is our instrumental variable--
fyll<-fitted(o)
#--keep R2 from this regression--
lr2<-summary(o)$r.squared
m9<-cbind(m9,fydd,fyll)
#--OLS estimate of unrestricted model--
xUR<-lm(childvalue~fyll+fydd+
cultints+roots+cereals+gath+plow+
hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
settype+localjh+superjh+moralgods+fempower+femsubs+
sexratio+war+himilexp+money+wagelabor+
migr+brideprice+nuclearfam+pctFemPolyg
,data=m9)
#--OLS estimate of restricted model--
xR<-lm(childvalue ~ fyll + cultints + roots + fish +
exogamy + settype + femsubs, data = m9)
#--corrected sigma2 and R2 for 2SLS--
qxx<-m9
qxx[,"fydd"]<-cyd
qxx[,"fyll"]<-cyl
b<-coef(xR)
incpt<-matrix(1,NROW(qxx),1)
x<-as.matrix(cbind(incpt,qxx[,names(b)[-1]]))
e<-y-x%*%as.matrix(b)
cs2<-as.numeric(t(e)%*%e/(NROW(x)-NCOL(x)))
cr2<-as.numeric(1-t(e)%*%e/sum((y-mean(y))^2))
#--collect coefficients and their variances--
ov<-summary(xR)
vif<-rbind(vif,vif(xR))
ss<-rbind(ss,diag(ov$cov*cs2))
#--collect robust coef. variances when there is heteroskedasticity--
#eb<-e^2
#x<-as.matrix(cbind(incpt,m9[,names(b)[-1]]))
#hcm<-inv(t(x)%*%x)%*%t(x)%*%diag(eb[1:length(eb)])%*%x%*%inv(t(x)%*%x)
#ss<-rbind(ss,diag(hcm))
beta<-rbind(beta,coef(xR))
#--collect some model diagnostics--
dropt<-c("cereals","gath","plow","hunt","anim",
"pigs","milk","bovines","foodscarc","ecorich",
"popdens","pathstress","ncmallow","famsize","localjh",
"superjh","moralgods","fempower","sexratio","money",
"fydd","wagelabor","war","himilexp","tree","foodtrade")
#--Ramsey RESET test--
p1<-qchisq(resettest(xR,type="fitted")$"p.value",1,lower.tail=FALSE)
#--Wald test (H0: dropped variables have coefficient equal zero)--
o<-linear.hypothesis(xUR,dropt,test="Chisq")$"Pr(>Chisq)"[2]
p2<-qchisq(o,1,lower.tail=FALSE)
#--Heteroskedasticity test (H0: homoskedastic residuals)--
p3<-ncv.test(xR)$ChiSquare
#--Shapiro-Wilke normality test (H0: residuals normal)
p4<-qchisq(shapiro.test(residuals(xR))$p.value,1,lower.tail=FALSE)
#--LaGrange Multiplier test for spatial autocorrelation: language--
o<-lm.LMtests(xR, wmatll, test=c("LMlag"))
p5<-as.numeric(o$LMlag$statistic)
#--LaGrange Multiplier test for spatial autocorrelation: distance--
o<-lm.LMtests(xR, wmatdd, test=c("LMlag"))
p6<-as.numeric(o$LMlag$statistic)
#--model R2--
p7<-cr2
dng<-rbind(dng,cbind(p1,p2,p3,p4,p5,p6,p7,dr2,lr2))
}
#--------------------------------------------
#--Rubin's formulas for combining estimates--
#--------------------------------------------
#--first find final regr. coefs. and p-values--
mnb<-apply(beta,2,mean)
vrb<-colSums((beta-t(matrix(mnb,length(mnb),10)))^2)/(nimp-1)
mnv<-apply(ss,2,mean)
vrT<-mnv+vrb*(1-nimp^(-1))
fst<-mnb^2/vrT
r<-(1+nimp^(-1))*vrb/mnv
v<-(nimp-1)*(1+r^(-1))^2
pval<-pf(fst,1,v,lower.tail=FALSE)
bbb<-data.frame(round(cbind(mnb,fst,v,pval),3))
bbb$VIF[2:NROW(bbb)]<-round(apply(vif,2,mean),3)
names(bbb)<-c("coef","Fstat","ddf","pvalue","VIF")
#--Then combine the diagnostics we collected--
dng<-data.frame(dng)
names(dng)<-c("RESET","Wald on restrs.","NCV","SWnormal","lagll","lagdd",
"R2:final model","R2:IV(distance)","R2:IV(language)")
r2<-apply(dng[,7:9],2,mean)
adng<-dng[,1:6]
mdm<-apply(adng,2,mean)
vrd<-colSums((adng-t(matrix(mdm,length(mdm),nimp)))^2)/(nimp-1)
aa<-4*mdm^2-2*vrd
aa[which(aa<0)]<-0
rd<-(1+nimp^(-1))*vrd/(2*mdm+aa^.5)
vd<-(nimp-1)*(1+rd^(-1))^2
Dm<-(mdm-(nimp-1)/(nimp+1)*rd)/(1+rd)
#-All chi-sq we collected have df=1-------
pvald<-pf(Dm,1,vd,lower.tail=FALSE)
ccc<-data.frame(round(cbind(Dm,vd,pvald),3))
names(ccc)<-c("Fstat","df","pvalue")
bbb
r2
ccc
#--write results to csv file for perusal in spreadsheet--
write.csv("==OLS model for childvalue==",file="OLSresults.csv",append=FALSE)
write.csv(bbb,file="OLSresults.csv",append=TRUE)
write.csv(r2,file="OLSresults.csv",append=TRUE)
write.csv(ccc,file="OLSresults.csv",append=TRUE)
