EduMod 1: Max(valchild) Imputation and Regression

From InterSciWiki
Jump to: navigation, search

EduMod Author(s): User:Douglas R. White, Anthon Eff: Max(valchild) as dependent variables

Contents

A| Revised Eff and Dow Program Nov 6, 2009 (Restricted Model)

There was an error in xUR<- (two ++ in a row)

hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
#Program 1 of 2 (Each can be run separately)
#MI--make the imputed datasets
#--change the following path to the directory with your data and program--
setwd("C:/My Documents/MI")

rm(list=ls(all=TRUE))
options(echo=TRUE)
#--you need the following two packages--you must install them first--
library(foreign)
library(mice)

#--To find the citation for a package, use this function:---
citation("mice")

#-----------------------------
#--Read in data, rearrange----
#-----------------------------

#--Read in auxiliary variables---
load("vaux.Rdata",.GlobalEnv)
row.names(vaux)<-NULL
#--Read in the SCCS dataset---
load("SCCS.Rdata",.GlobalEnv)

#--look at first 6 rows of vaux--
head(vaux)
#--look at field names of vaux--
names(vaux)
#--check to see that rows are properly aligned in the two datasets--
#--sum should equal 186---
sum((SCCS$socname==vaux$socname)*1)
#--remove the society name field--
vaux<-vaux[,-28]
names(vaux)

#--Two nominal variables: brg and rlg----
#--brg: consolidated Burton  Regions-----
#0 = (rest of world) circumpolar, South and Meso-America, west North America
#1 = Subsaharan Africa
#2 = Middle Old World
#3 = Southeast Asia, Insular Pacific, Sahul
#4 = Eastern Americas
#--rlg: Religion---
#'0 (no world religion)'  
#'1 (Christianity)'  
#'2 (Islam)'  
#'3 (Hindu/Buddhist)'  

#--check to see number of missing values in vaux, 
#--whether variables are numeric,
#--and number of discrete values for each variable---
vvn<-names(vaux)
pp<-NULL
for (i in 1:length(vvn)){
nmiss<-length(which(is.na(vaux[,vvn[i]])))
numeric<-is.numeric(vaux[,vvn[i]])
numDiscrVals<-length(table(vaux[,vvn[i]]))
pp<-rbind(pp,cbind(data.frame(numeric),nmiss,numDiscrVals))
}
row.names(pp)<-vvn
pp

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--extract variables to be used from SCCS, put in dataframe fx--
fx<-data.frame(
socname=SCCS$socname,socID=SCCS$"sccs#",
valchild=(SCCS$v473+SCCS$v474+SCCS$v475+SCCS$v476),
cultints=SCCS$v232,roots=(SCCS$v233==5)*1,
cereals=(SCCS$v233==6)*1,gath=SCCS$v203,hunt=SCCS$v204,
fish=SCCS$v205,anim=SCCS$v206,femsubs=SCCS$v890,
pigs=(SCCS$v244==2)*1,milk=(SCCS$v245>1)*1,plow=(SCCS$v243>1)*1,
bovines=(SCCS$v244==7)*1,tree=(SCCS$v233==4)*1,
foodtrade=SCCS$v819,foodscarc=SCCS$v1685,
ecorich=SCCS$v857,popdens=SCCS$v156,pathstress=SCCS$v1260,
CVrain=SCCS$v1914/SCCS$v1913,rain=SCCS$v854,temp=SCCS$v855,
AP1=SCCS$v921,AP2=SCCS$v928,ndrymonth=SCCS$v196,
exogamy=SCCS$v72,ncmallow=SCCS$v227,famsize=SCCS$v80,
settype=SCCS$v234,localjh=(SCCS$v236-1),superjh=SCCS$v237,
moralgods=SCCS$v238,fempower=SCCS$v663,
sexratio=1+(SCCS$v1689>85)+(SCCS$v1689>115),
war=SCCS$v1648,himilexp=(SCCS$v899==1)*1,
money=SCCS$v155,wagelabor=SCCS$v1732,
migr=(SCCS$v677==2)*1,brideprice=(SCCS$v208==1)*1,
nuclearfam=(SCCS$v210<=3)*1,pctFemPolyg=SCCS$v872
)

#--look at first 6 rows of fx--
head(fx)

######---------MODIFICATIONS BELOW THIS LINE--------
#--check to see number of missing values--
#--also check whether numeric--
#--and provide some descriptive statistics--
vvn<-names(fx)
dstats<-NULL
for (i in 1:length(vvn)){
z<-which(!is.na(fx[,vvn[i]]))
notmiss<-length(z)
numeric<-is.numeric(fx[,vvn[i]])
min<-999
max<-999
mean<-999
sd<-999
if (numeric==TRUE){
min<-min(fx[z,vvn[i]])
max<-max(fx[z,vvn[i]])
mean<-mean(fx[z,vvn[i]])
sd<-sd(fx[z,vvn[i]])
}
CV<-sd/mean
dstats<-rbind(dstats,cbind(notmiss,round(cbind(min,max,mean,sd,CV),4),data.frame(numeric)))
}
row.names(dstats)<-vvn
dstats

#--write these to a csv file--
write.csv(dstats,file="descriptive.csv")


#--identify variables with missing values--
z<-which(dstats$notmiss<186)
zv1<-vvn[z]
zv1
#--identify variables with non-missing values--
z<-which(dstats$notmiss==186)
zv2<-vvn[z]
zv2
######---------MODIFICATIONS ABOVE THIS LINE--------


#-----------------------------
#----Multiple imputation------
#-----------------------------

#--number of imputed data sets to create--
nimp<-10
#--one at a time, loop through those variables with missing values--
for (i in 1:length(zv1)){
#--attach the imputand to the auxiliary data--
zxx<-data.frame(cbind(vaux,fx[,zv1[i]]))
#--in the following line, the imputation is done--
aqq<-complete(mice(zxx,maxit=100,m=nimp),action="long")
#--during first iteration of the loop, create dataframe impdat--
if (i==1){
impdat<-data.frame(aqq[,c(".id",".imp")])
}
#--the imputand is placed as a field in impdat and named--
impdat<-cbind(impdat,data.frame(aqq[,NCOL(zxx)]))
names(impdat)[NCOL(impdat)]<-zv1[i]
}

#--now the non-missing variables are attached to impdat--
gg<-NULL
for (i in 1:nimp){
gg<-rbind(gg,data.frame(fx[,zv2]))
}
impdat<-cbind(impdat,gg)

#--take a look at the top 6 and bottom 6 rows of impdat--
head(impdat)
tail(impdat)

#--impdat is saved as an R-format data file--
save(impdat,file="impdat.Rdata")




 



#Program 2 of 2  
#MI--estimate model with network-lagged dependent variables, combine results
rm(list=ls(all=TRUE))
#--Set path to your directory with data and program--
setwd("C:/My Documents/MI")
options(echo=TRUE)

#--need these packages for estimation and diagnostics--
library(foreign)
library(spdep)
library(car)
library(lmtest)
library(sandwich)

#-----------------------------
#--Read in data, rearrange----
#-----------------------------

#--Read in original SCCS data---
load("SCCS.Rdata",.GlobalEnv)
#--Read in two weight matrices--
ll<-as.matrix(read.dta("langwm.dta")[,-1])
dd<-as.matrix(read.dta("dist25wm.dta")[,c(-1,-2,-189)])
#--Read in the imputed dataset---
load("impdat.Rdata",.GlobalEnv)

#HERE YOU CHANGE HOW THE DEPENDENT VARIABLE IS COMPUTED FOR A NEW PROJECT
#--create dep.varb. you wish to use from SCCS data--
#--Here we sum variables measuring how much a society values children--
#--can replace "sum" with "max"
depvar<-apply(SCCS[,c("v473","v474","v475","v476")],1,sum) 
#--find obs. for which dep. varb. is non-missing--
zdv<-which(!is.na(depvar))
depvar<-depvar[zdv]
#HERE GIVE THE "NAME" OF THE DEPENDENT VARIABLE THAT IS COMPUTED
depvarname<-"child value"
#--can add additional SCCS variable, but only if it has no missing values---
dateobs<-SCCS$v838
dateobs<-dateobs[zdv]

#--look at frequencies and quartiles for the dep. varb.--
summary(depvar)
table(depvar)

#--modify weight matrices---
#--set diagonal equal to zeros--
diag(ll)<-0
diag(dd)<-0
#--use only obs. where dep. varb. non-missing--
ll<-ll[zdv,zdv]
dd<-dd[zdv,zdv]
#--row standardize (rows sum to one)
ll<-ll/rowSums(ll)
dd<-dd/rowSums(dd)
#--make weight matrix object for later autocorrelation test--
wmatll<-mat2listw(as.matrix(ll))
wmatdd<-mat2listw(as.matrix(dd))

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
indpv<-c("femsubs","foodscarc","exogamy","ncmallow","superjh","moralgods",
"fempower","sexratio","war","himilexp","wagelabor","famsize","settype",
"localjh","money","cultints","roots","cereals","gath","hunt","fish",
"anim","pigs","milk","plow","bovines","tree","foodtrade",
"ndrymonth","ecorich","popdens","pathstress","CVrain","rain",
"temp","AP1","AP2","migr","brideprice","nuclearfam","pctFemPolyg")

#-----------------------------------------------------
#---Estimate model on each imputed dataset------------
#-----------------------------------------------------

#--number of imputed datasets--
nimp<-10

#--will append values to these empty objects--
vif<-NULL
ss<-NULL
beta<-NULL
dng<-NULL

#--loop through the imputed datasets--
for (i in 1:nimp){

#--select the ith imputed dataset--
m9<-impdat[which(impdat$.imp==i),]
#--retain only obs. for which dep. varb. is nonmissing--
m9<-m9[zdv,]

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--create spatially lagged dep. varbs. in stage 1 OLS--
y<-as.matrix(depvar)
xx<-as.matrix(m9[,indpv])
#--for instruments we use the spatial lag of our indep. varbs.--
#--First, the spatially lagged varb. for distance--
xdy<-dd%*%xx
cyd<-dd%*%y
o<-lm(cyd~xdy)
#--the fitted value is our instrumental variable--
fydd<-fitted(o)
#--keep R2 from this regression--
dr2<-summary(o)$r.squared
#--Then, the spatially lagged varb. for language--
xly<-ll%*%xx   
cyl<-ll%*%y
o<-lm(cyl~xly)
#--the fitted value is our instrumental variable--
fyll<-fitted(o)
#--keep R2 from this regression--
lr2<-summary(o)$r.squared
m9<-cbind(m9,fydd,fyll)

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--Stage 2 OLS estimate of unrestricted model--
xUR<-lm(depvar~fyll+fydd+dateobs+
cultints+roots+cereals+gath+plow+
hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
settype+localjh+superjh+moralgods+fempower+femsubs+
sexratio+war+himilexp+money+wagelabor+
migr+brideprice+nuclearfam+pctFemPolyg
,data=m9)

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--Stage 2 OLS estimate of restricted model--
xR<-lm(depvar ~ fyll + cultints + roots + fish + 
    exogamy + settype + femsubs, data = m9)

#--corrected sigma2 and R2 for 2SLS--
qxx<-m9
qxx[,"fydd"]<-cyd
qxx[,"fyll"]<-cyl
b<-coef(xR)
incpt<-matrix(1,NROW(qxx),1)
x<-as.matrix(cbind(incpt,qxx[,names(b)[-1]]))
e<-y-x%*%as.matrix(b)
cs2<-as.numeric(t(e)%*%e/(NROW(x)-NCOL(x)))
cr2<-as.numeric(1-t(e)%*%e/sum((y-mean(y))^2))

#--collect coefficients and their variances--
ov<-summary(xR)
vif<-rbind(vif,vif(xR))
ss<-rbind(ss,diag(ov$cov*cs2))
#--collect robust coef. variances when there is heteroskedasticity--
#eb<-e^2
#x<-as.matrix(cbind(incpt,m9[,names(b)[-1]]))
#hcm<-inv(t(x)%*%x)%*%t(x)%*%diag(eb[1:length(eb)])%*%x%*%inv(t(x)%*%x)
#ss<-rbind(ss,diag(hcm))
beta<-rbind(beta,coef(xR))

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--collect some model diagnostics--
dropt<-c("cereals","gath","plow","hunt","anim","dateobs",
"pigs","milk","bovines","foodscarc","ecorich",
"popdens","pathstress","ncmallow","famsize","localjh",
"superjh","moralgods","fempower","sexratio","money",
"fydd","wagelabor","war","himilexp","tree","foodtrade")


#--Ramsey RESET test--
p1<-qchisq(resettest(xR,type="fitted")$"p.value",1,lower.tail=FALSE)
#--Wald test (H0: dropped variables have coefficient equal zero)--
o<-linear.hypothesis(xUR,dropt,test="Chisq")$"Pr(>Chisq)"[2]
p2<-qchisq(o,1,lower.tail=FALSE) #find Chisq with 1 d.f. and same pvalue
#--Heteroskedasticity test (H0: homoskedastic residuals)--
p3<-ncv.test(xR)$ChiSquare
#--Shapiro-Wilke normality test (H0: residuals normal)
p4<-qchisq(shapiro.test(e)$p.value,1,lower.tail=FALSE)
#--LaGrange Multiplier test for spatial autocorrelation: language--
o<-lm.LMtests(xR, wmatll, test=c("LMlag"))
p5<-as.numeric(o$LMlag$statistic)
#--LaGrange Multiplier test for spatial autocorrelation: distance--
o<-lm.LMtests(xR, wmatdd, test=c("LMlag"))
p6<-as.numeric(o$LMlag$statistic)
#--model R2--
p7<-cr2
dng<-rbind(dng,cbind(p1,p2,p3,p4,p5,p6,p7,dr2,lr2))

}

#--------------------------------------------
#--Rubin's formulas for combining estimates--
#--------------------------------------------

#--first find final regr. coefs. and p-values--
mnb<-apply(beta,2,mean)
vrb<-colSums((beta-t(matrix(mnb,length(mnb),10)))^2)/(nimp-1)
mnv<-apply(ss,2,mean)
vrT<-mnv+vrb*(1-nimp^(-1))
fst<-mnb^2/vrT
r<-(1+nimp^(-1))*vrb/mnv
v<-(nimp-1)*(1+r^(-1))^2
pval<-pf(fst,1,v,lower.tail=FALSE)
bbb<-data.frame(round(cbind(mnb,fst,v,pval),3))
bbb$VIF[2:NROW(bbb)]<-round(apply(vif,2,mean),3)
names(bbb)<-c("coef","Fstat","ddf","pvalue","VIF")

#--Then combine the diagnostics we collected--
dng<-data.frame(dng)
names(dng)<-c("RESET","Wald on restrs.","NCV","SWnormal","lagll","lagdd",
"R2:final model","R2:IV(distance)","R2:IV(language)")
r2<-apply(dng[,7:9],2,mean)
adng<-dng[,1:6]
mdm<-apply(adng,2,mean)
vrd<-colSums((adng-t(matrix(mdm,length(mdm),nimp)))^2)/(nimp-1)
aa<-4*mdm^2-2*vrd
aa[which(aa<0)]<-0
rd<-(1+nimp^(-1))*vrd/(2*mdm+aa^.5)
vd<-(nimp-1)*(1+rd^(-1))^2
Dm<-(mdm-(nimp-1)/(nimp+1)*rd)/(1+rd)
#-All chi-sq we collected have df=1-------
pvald<-pf(Dm,1,vd,lower.tail=FALSE)
ccc<-data.frame(round(cbind(Dm,vd,pvald),3))
names(ccc)<-c("Fstat","df","pvalue")
nn=" Depvar: Values, "Freqs", N=, and depvarname"
aaa<-c(table(depvar), NROW(depvar),depvarname)
nn
aaa
bbb
r2
ccc

######---------MODIFICATIONS BELOW THIS LINE--------
#--write results to csv file for perusal in spreadsheet--
write.csv(paste("2SLS model for
",depvarname,sep=""),file="OLSresults.csv",append=FALSE)
write.csv(table(depvar),file="OLSresults.csv",append=TRUE)
write.csv(unclass(summary(depvar)),file="OLSresults.csv",append=TRUE)
write.csv(paste("N=",NROW(depvar),"; number
imputations=",nimp,sep=""),file="OLSresults.csv",append=TRUE)
write.csv(bbb,file="OLSresults.csv",append=TRUE)
write.csv(r2,file="OLSresults.csv",append=TRUE)
write.csv(ccc,file="OLSresults.csv",append=TRUE)

u<-cbind(data.frame(depvar),SCCS[zdv,"socname"])
names(u)<-c(depvarname,"socname")

write.csv(u,file="OLSresults.csv",append=TRUE)
######---------MODIFICATIONS ABOVE THIS LINE--------

Restricted Model Results Depvar Child Value

>  bbb
               coef Fstat        ddf pvalue   VIF
(Intercept) -10.119 0.816    2226584  0.366    NA
fyll          1.404 8.112    2164440  0.004 1.319
cultints      0.794 5.663  281412350  0.017 1.897
roots        -2.297 4.015  674657656  0.045 1.209
fish          0.578 5.301  280807638  0.021 1.239
exogamy      -0.976 6.580  260309296  0.010 1.133
settype      -0.450 4.015  215694150  0.045 1.684
femsubs       0.632 4.065 1141360091  0.044 1.241
>  r2
 R2:final model R2:IV(distance) R2:IV(language) 
      0.1065208       0.9247485       0.9664180 
>  ccc
                Fstat           df pvalue
RESET           0.682 2.516212e+06  0.409
Wald on restrs. 0.341 8.130710e+02  0.560
NCV             1.078 8.842909e+06  0.299
SWnormal        0.676 8.365628e+08  0.411
lagll           1.703 4.605342e+06  0.192
lagdd           3.412 7.581263e+07  0.065

A Revised Eff and Dow Program Nov 6, 2009 (Unrestricted Model)

#Program 1 of 2 (Each can be run separately)
#MI--make the imputed datasets
#--change the following path to the directory with your data and program--
setwd("C:/My Documents/MI")

rm(list=ls(all=TRUE))
options(echo=TRUE)
#--you need the following two packages--you must install them first--
library(foreign)
library(mice)

#--To find the citation for a package, use this function:---
citation("mice")

#-----------------------------
#--Read in data, rearrange----
#-----------------------------

#--Read in auxiliary variables---
load("vaux.Rdata",.GlobalEnv)
row.names(vaux)<-NULL
#--Read in the SCCS dataset---
load("SCCS.Rdata",.GlobalEnv)

#--look at first 6 rows of vaux--
head(vaux)
#--look at field names of vaux--
names(vaux)
#--check to see that rows are properly aligned in the two datasets--
#--sum should equal 186---
sum((SCCS$socname==vaux$socname)*1)
#--remove the society name field--
vaux<-vaux[,-28]
names(vaux)

#--Two nominal variables: brg and rlg----
#--brg: consolidated Burton  Regions-----
#0 = (rest of world) circumpolar, South and Meso-America, west North America
#1 = Subsaharan Africa
#2 = Middle Old World
#3 = Southeast Asia, Insular Pacific, Sahul
#4 = Eastern Americas
#--rlg: Religion---
#'0 (no world religion)'  
#'1 (Christianity)'  
#'2 (Islam)'  
#'3 (Hindu/Buddhist)'  

#--check to see number of missing values in vaux, 
#--whether variables are numeric,
#--and number of discrete values for each variable---
vvn<-names(vaux)
pp<-NULL
for (i in 1:length(vvn)){
nmiss<-length(which(is.na(vaux[,vvn[i]])))
numeric<-is.numeric(vaux[,vvn[i]])
numDiscrVals<-length(table(vaux[,vvn[i]]))
pp<-rbind(pp,cbind(data.frame(numeric),nmiss,numDiscrVals))
}
row.names(pp)<-vvn
pp

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--extract variables to be used from SCCS, put in dataframe fx--
fx<-data.frame(
socname=SCCS$socname,socID=SCCS$"sccs#",
valchild=(SCCS$v473+SCCS$v474+SCCS$v475+SCCS$v476),
cultints=SCCS$v232,roots=(SCCS$v233==5)*1,
cereals=(SCCS$v233==6)*1,gath=SCCS$v203,hunt=SCCS$v204,
fish=SCCS$v205,anim=SCCS$v206,femsubs=SCCS$v890,
pigs=(SCCS$v244==2)*1,milk=(SCCS$v245>1)*1,plow=(SCCS$v243>1)*1,
bovines=(SCCS$v244==7)*1,tree=(SCCS$v233==4)*1,
foodtrade=SCCS$v819,foodscarc=SCCS$v1685,
ecorich=SCCS$v857,popdens=SCCS$v156,pathstress=SCCS$v1260,
CVrain=SCCS$v1914/SCCS$v1913,rain=SCCS$v854,temp=SCCS$v855,
AP1=SCCS$v921,AP2=SCCS$v928,ndrymonth=SCCS$v196,
exogamy=SCCS$v72,ncmallow=SCCS$v227,famsize=SCCS$v80,
settype=SCCS$v234,localjh=(SCCS$v236-1),superjh=SCCS$v237,
moralgods=SCCS$v238,fempower=SCCS$v663,
sexratio=1+(SCCS$v1689>85)+(SCCS$v1689>115),
war=SCCS$v1648,himilexp=(SCCS$v899==1)*1,
money=SCCS$v155,wagelabor=SCCS$v1732,
migr=(SCCS$v677==2)*1,brideprice=(SCCS$v208==1)*1,
nuclearfam=(SCCS$v210<=3)*1,pctFemPolyg=SCCS$v872
)

#--look at first 6 rows of fx--
head(fx)

######---------MODIFICATIONS BELOW THIS LINE--------
#--check to see number of missing values--
#--also check whether numeric--
#--and provide some descriptive statistics--
vvn<-names(fx)
dstats<-NULL
for (i in 1:length(vvn)){
z<-which(!is.na(fx[,vvn[i]]))
notmiss<-length(z)
numeric<-is.numeric(fx[,vvn[i]])
min<-999
max<-999
mean<-999
sd<-999
if (numeric==TRUE){
min<-min(fx[z,vvn[i]])
max<-max(fx[z,vvn[i]])
mean<-mean(fx[z,vvn[i]])
sd<-sd(fx[z,vvn[i]])
}
CV<-sd/mean
dstats<-rbind(dstats,cbind(notmiss,round(cbind(min,max,mean,sd,CV),4),data.frame(numeric)))
}
row.names(dstats)<-vvn
dstats

#--write these to a csv file--
write.csv(dstats,file="descriptive.csv")


#--identify variables with missing values--
z<-which(dstats$notmiss<186)
zv1<-vvn[z]
zv1
#--identify variables with non-missing values--
z<-which(dstats$notmiss==186)
zv2<-vvn[z]
zv2
######---------MODIFICATIONS ABOVE THIS LINE--------


#-----------------------------
#----Multiple imputation------
#-----------------------------

#--number of imputed data sets to create--
nimp<-10
#--one at a time, loop through those variables with missing values--
for (i in 1:length(zv1)){
#--attach the imputand to the auxiliary data--
zxx<-data.frame(cbind(vaux,fx[,zv1[i]]))
#--in the following line, the imputation is done--
aqq<-complete(mice(zxx,maxit=100,m=nimp),action="long")
#--during first iteration of the loop, create dataframe impdat--
if (i==1){
impdat<-data.frame(aqq[,c(".id",".imp")])
}
#--the imputand is placed as a field in impdat and named--
impdat<-cbind(impdat,data.frame(aqq[,NCOL(zxx)]))
names(impdat)[NCOL(impdat)]<-zv1[i]
}

#--now the non-missing variables are attached to impdat--
gg<-NULL
for (i in 1:nimp){
gg<-rbind(gg,data.frame(fx[,zv2]))
}
impdat<-cbind(impdat,gg)

#--take a look at the top 6 and bottom 6 rows of impdat--
head(impdat)
tail(impdat)

#--impdat is saved as an R-format data file--
save(impdat,file="impdat.Rdata")




 



#Program 2 of 2  
#MI--estimate model with network-lagged dependent variables, combine results
rm(list=ls(all=TRUE))
#--Set path to your directory with data and program--
setwd("C:/My Documents/MI")
options(echo=TRUE)

#--need these packages for estimation and diagnostics--
library(foreign)
library(spdep)
library(car)
library(lmtest)
library(sandwich)

#-----------------------------
#--Read in data, rearrange----
#-----------------------------

#--Read in original SCCS data---
load("SCCS.Rdata",.GlobalEnv)
#--Read in two weight matrices--
ll<-as.matrix(read.dta("langwm.dta")[,-1])
dd<-as.matrix(read.dta("dist25wm.dta")[,c(-1,-2,-189)])
#--Read in the imputed dataset---
load("impdat.Rdata",.GlobalEnv)

#HERE YOU CHANGE HOW THE DEPENDENT VARIABLE IS COMPUTED FOR A NEW PROJECT
#--create dep.varb. you wish to use from SCCS data--
#--Here we sum variables measuring how much a society values children--
#--can replace "sum" with "max"
depvar<-apply(SCCS[,c("v473","v474","v475","v476")],1,sum) 
#--find obs. for which dep. varb. is non-missing--
zdv<-which(!is.na(depvar))
depvar<-depvar[zdv]
#HERE GIVE THE "NAME" OF THE DEPENDENT VARIABLE THAT IS COMPUTED
depvarname<-"child value"
#--can add additional SCCS variable, but only if it has no missing values---
dateobs<-SCCS$v838
dateobs<-dateobs[zdv]

#--look at frequencies and quartiles for the dep. varb.--
summary(depvar)
table(depvar)

#--modify weight matrices---
#--set diagonal equal to zeros--
diag(ll)<-0
diag(dd)<-0
#--use only obs. where dep. varb. non-missing--
ll<-ll[zdv,zdv]
dd<-dd[zdv,zdv]
#--row standardize (rows sum to one)
ll<-ll/rowSums(ll)
dd<-dd/rowSums(dd)
#--make weight matrix object for later autocorrelation test--
wmatll<-mat2listw(as.matrix(ll))
wmatdd<-mat2listw(as.matrix(dd))

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
indpv<-c("femsubs","foodscarc","exogamy","ncmallow","superjh","moralgods",
"fempower","sexratio","war","himilexp","wagelabor","famsize","settype",
"localjh","money","cultints","roots","cereals","gath","hunt","fish",
"anim","pigs","milk","plow","bovines","tree","foodtrade",
"ndrymonth","ecorich","popdens","pathstress","CVrain","rain",
"temp","AP1","AP2","migr","brideprice","nuclearfam","pctFemPolyg")

#-----------------------------------------------------
#---Estimate model on each imputed dataset------------
#-----------------------------------------------------

#--number of imputed datasets--
nimp<-10

#--will append values to these empty objects--
vif<-NULL
ss<-NULL
beta<-NULL
dng<-NULL

#--loop through the imputed datasets--
for (i in 1:nimp){

#--select the ith imputed dataset--
m9<-impdat[which(impdat$.imp==i),]
#--retain only obs. for which dep. varb. is nonmissing--
m9<-m9[zdv,]

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--create spatially lagged dep. varbs. in stage 1 OLS--
y<-as.matrix(depvar)
xx<-as.matrix(m9[,indpv])
#--for instruments we use the spatial lag of our indep. varbs.--
#--First, the spatially lagged varb. for distance--
xdy<-dd%*%xx
cyd<-dd%*%y
o<-lm(cyd~xdy)
#--the fitted value is our instrumental variable--
fydd<-fitted(o)
#--keep R2 from this regression--
dr2<-summary(o)$r.squared
#--Then, the spatially lagged varb. for language--
xly<-ll%*%xx   
cyl<-ll%*%y
o<-lm(cyl~xly)
#--the fitted value is our instrumental variable--
fyll<-fitted(o)
#--keep R2 from this regression--
lr2<-summary(o)$r.squared
m9<-cbind(m9,fydd,fyll)

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--Stage 2 OLS estimate of unrestricted model--
xUR<-lm(depvar~fyll+fydd+dateobs+
cultints+roots+cereals+gath+plow+
hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
settype+localjh+superjh+moralgods+fempower+femsubs+
sexratio+war+himilexp+money+wagelabor+
migr+brideprice+nuclearfam+pctFemPolyg
,data=m9)

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--Stage 2 OLS estimate of restricted model--
xR<-lm(depvar ~ fyll + fydd+
cultints+roots+cereals+gath+plow+
hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
settype+localjh+superjh+moralgods+fempower+femsubs+
sexratio+war+himilexp+money+wagelabor+
migr+brideprice+nuclearfam+pctFemPolyg
,data=m9)
#1# cultints + roots + fish + 
#1# exogamy + settype + femsubs, data = m9)

#--corrected sigma2 and R2 for 2SLS--
qxx<-m9
qxx[,"fydd"]<-cyd
qxx[,"fyll"]<-cyl
b<-coef(xR)
incpt<-matrix(1,NROW(qxx),1)
x<-as.matrix(cbind(incpt,qxx[,names(b)[-1]]))
e<-y-x%*%as.matrix(b)
cs2<-as.numeric(t(e)%*%e/(NROW(x)-NCOL(x)))
cr2<-as.numeric(1-t(e)%*%e/sum((y-mean(y))^2))

#--collect coefficients and their variances--
ov<-summary(xR)
vif<-rbind(vif,vif(xR))
ss<-rbind(ss,diag(ov$cov*cs2))
#--collect robust coef. variances when there is heteroskedasticity--
#eb<-e^2
#x<-as.matrix(cbind(incpt,m9[,names(b)[-1]]))
#hcm<-inv(t(x)%*%x)%*%t(x)%*%diag(eb[1:length(eb)])%*%x%*%inv(t(x)%*%x)
#ss<-rbind(ss,diag(hcm))
beta<-rbind(beta,coef(xR))

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--collect some model diagnostics--
dropt<-c("cereals","gath","plow","hunt","anim","dateobs",
"pigs","milk","bovines","foodscarc","ecorich",
"popdens","pathstress","ncmallow","famsize","localjh",
"superjh","moralgods","fempower","sexratio","money",
"fydd","wagelabor","war","himilexp","tree","foodtrade")


#--Ramsey RESET test--
p1<-qchisq(resettest(xR,type="fitted")$"p.value",1,lower.tail=FALSE)
#--Wald test (H0: dropped variables have coefficient equal zero)--
o<-linear.hypothesis(xUR,dropt,test="Chisq")$"Pr(>Chisq)"[2]
p2<-qchisq(o,1,lower.tail=FALSE) #find Chisq with 1 d.f. and same pvalue
#--Heteroskedasticity test (H0: homoskedastic residuals)--
p3<-ncv.test(xR)$ChiSquare
#--Shapiro-Wilke normality test (H0: residuals normal)
p4<-qchisq(shapiro.test(e)$p.value,1,lower.tail=FALSE)
#--LaGrange Multiplier test for spatial autocorrelation: language--
o<-lm.LMtests(xR, wmatll, test=c("LMlag"))
p5<-as.numeric(o$LMlag$statistic)
#--LaGrange Multiplier test for spatial autocorrelation: distance--
o<-lm.LMtests(xR, wmatdd, test=c("LMlag"))
p6<-as.numeric(o$LMlag$statistic)
#--model R2--
p7<-cr2
dng<-rbind(dng,cbind(p1,p2,p3,p4,p5,p6,p7,dr2,lr2))

}

#--------------------------------------------
#--Rubin's formulas for combining estimates--
#--------------------------------------------

#--first find final regr. coefs. and p-values--
mnb<-apply(beta,2,mean)
vrb<-colSums((beta-t(matrix(mnb,length(mnb),10)))^2)/(nimp-1)
mnv<-apply(ss,2,mean)
vrT<-mnv+vrb*(1-nimp^(-1))
fst<-mnb^2/vrT
r<-(1+nimp^(-1))*vrb/mnv
v<-(nimp-1)*(1+r^(-1))^2
pval<-pf(fst,1,v,lower.tail=FALSE)
bbb<-data.frame(round(cbind(mnb,fst,v,pval),3))
bbb$VIF[2:NROW(bbb)]<-round(apply(vif,2,mean),3)
names(bbb)<-c("coef","Fstat","ddf","pvalue","VIF")

#--Then combine the diagnostics we collected--
dng<-data.frame(dng)
names(dng)<-c("RESET","Wald on restrs.","NCV","SWnormal","lagll","lagdd",
"R2:final model","R2:IV(distance)","R2:IV(language)")
r2<-apply(dng[,7:9],2,mean)
adng<-dng[,1:6]
mdm<-apply(adng,2,mean)
vrd<-colSums((adng-t(matrix(mdm,length(mdm),nimp)))^2)/(nimp-1)
aa<-4*mdm^2-2*vrd
aa[which(aa<0)]<-0
rd<-(1+nimp^(-1))*vrd/(2*mdm+aa^.5)
vd<-(nimp-1)*(1+rd^(-1))^2
Dm<-(mdm-(nimp-1)/(nimp+1)*rd)/(1+rd)
#-All chi-sq we collected have df=1-------
pvald<-pf(Dm,1,vd,lower.tail=FALSE)
ccc<-data.frame(round(cbind(Dm,vd,pvald),3))
names(ccc)<-c("Fstat","df","pvalue")

nn=" Depvar: Values, 'Freqs', N=, and depvarname"
aaa<-c(table(depvar), NROW(depvar),depvarname)
nn
aaa
bbb
r2
ccc

######---------MODIFICATIONS BELOW THIS LINE--------
#--write results to csv file for perusal in spreadsheet--
write.csv(paste("2SLS model for
",depvarname,sep=""),file="OLSresults.csv",append=FALSE)
write.csv(table(depvar),file="OLSresults.csv",append=TRUE)
write.csv(unclass(summary(depvar)),file="OLSresults.csv",append=TRUE)
write.csv(paste("N=",NROW(depvar),"; number
imputations=",nimp,sep=""),file="OLSresults.csv",append=TRUE)
write.csv(bbb,file="OLSresults.csv",append=TRUE)
write.csv(r2,file="OLSresults.csv",append=TRUE)
write.csv(ccc,file="OLSresults.csv",append=TRUE)

u<-cbind(data.frame(depvar),SCCS[zdv,"socname"])
names(u)<-c(depvarname,"socname")

write.csv(u,file="OLSresults.csv",append=TRUE)
######---------MODIFICATIONS ABOVE THIS LINE--------

Unrestricted Model Depvar Child Value

 8  10  12  14  16  17  18  19  20  22  23  24  26  28  30  32  34  36     
 2   2   1   3   6   1  12   2  25  19   1  28  18  19   7  17   2   6 171 
" Depvar: Values | Freqs, and N="
>  aaa
           8            10            12            14            16            17            18            19 
         "2"           "2"           "1"           "3"           "6"           "1"          "12"           "2" 
          20            22            23            24            26            28            30            32 
        "25"          "19"           "1"          "28"          "18"          "19"           "7"          "17" 
          34            36                             
         "2"           "6"         "171" "child value" 
Unrestricted Model 
> bbb
              coef Fstat       ddf pvalue   VIF
(Intercept) -5.894 0.113  2212.372  0.737    NA
fyll         2.011 4.954  2814.957  0.026 4.242
fydd        -0.631 1.239 26292.315  0.266 3.557
cultints     1.057 3.553  8479.664  0.059 5.226
roots       -4.575 3.732 48226.830  0.053 5.109
cereals     -1.443 0.387 16997.525  0.534 7.452
gath        -0.449 0.798 76193.979  0.372 3.214
plow        -2.470 1.574 41357.112  0.210 3.137
hunt        -0.123 0.048 15013.323  0.826 5.360
fish         0.261 0.397  7808.221  0.529 3.282
anim        -0.108 0.048  6403.700  0.827 5.945
pigs         0.592 0.104  3852.221  0.748 2.217
milk        -1.582 0.738  6312.885  0.390 3.977
bovines      1.939 1.127  9588.305  0.288 4.286
tree        -4.908 2.650 47360.522  0.104 3.319
foodtrade    0.094 3.330 17599.575  0.068 1.605
foodscarc   -0.433 1.293  1156.262  0.256 1.275
ecorich     -0.175 0.138  1558.135  0.710 1.880
popdens     -0.352 0.438 21312.900  0.508 3.947
pathstress  -0.110 0.345 21714.404  0.557 2.832
exogamy     -0.928 4.434  9609.754  0.035 1.483
ncmallow    -0.142 0.391   423.395  0.532 1.703
famsize      0.280 0.302 16086.960  0.583 2.217
settype     -0.487 1.822 17575.486  0.177 4.277
localjh     -0.491 0.254 29934.981  0.614 1.872
superjh     -0.079 0.019  3993.422  0.891 2.742
moralgods    0.242 0.185   980.299  0.668 2.343
fempower     0.351 1.525   420.465  0.218 1.396
femsubs      0.958 6.088 16422.313  0.014 1.869
sexratio     0.180 0.042   979.687  0.837 1.416
war         -0.090 1.460  1387.955  0.227 1.405
himilexp     0.959 0.683   415.228  0.409 1.609
money        0.392 0.771  6314.898  0.380 2.410
wagelabor   -0.877 1.696   147.414  0.195 1.551
migr         0.453 0.148   143.261  0.701 1.511
brideprice  -1.015 0.650  1841.168  0.420 2.026
nuclearfam  -0.751 0.279  3020.962  0.597 2.337
pctFemPolyg  0.002 0.009   635.818  0.925 1.870
>  r2
 R2:final model R2:IV(distance) R2:IV(language) 
      0.2721369       0.9247485       0.9664180 
>  ccc
                 Fstat         df pvalue
RESET            2.760    258.756  0.098
Wald on restrs.  0.341    813.071  0.560
NCV             -0.003   6627.433  1.000
SWnormal         0.266   1138.087  0.606
lagll            3.022 311905.292  0.082
lagdd            4.333 359293.323  0.037
Personal tools