Depvar percent polygyny v827 R2=.40

From InterSciWiki

Jump to: navigation, search

Contents

[edit] 1

Back to Depvars. This is a Depvar series where Doug 08:02, 13 October 2009 (PDT) is experimenting with changing the dependent variable for different ways of measuring polygyny.

[edit] B| Results for polygyny=v860 r2=.50

These are results from polygyny v860 R2=.50

Still, Wald test shows omitted variable(s) Wald test H0: that the true values of these coefficients of dropped variables equal zero (none significant) p14 Eff and Dow 2009.

Ok, but the "dropt" variable list needs to include ALL THE OMITTED VARIABLED as in subheading 2A| 2.3. And lets at this point eliminate "plow"

>  bbb
1	2SLS model for polygyny (depvar)
               coef  Fstat       ddf pvalue   VIF
(Intercept)   1.460  1.198 52027.088  0.274    NA
fyll         -0.285  0.384 11482.367  0.535 2.192
fydd          0.721 14.820  1449.677  0.000 2.531
plow         -0.163  0.330  4641.687  0.566 2.018
anim          0.059  1.008   899.793  0.316 2.582
milk         -0.528  3.661  3333.265  0.056 2.771 <-- milk significant but high VIF
foodtrade    -0.013  2.553  6786.690  0.110 1.166
ecorich      -0.204  3.887   300.740  0.050 2.570
fish         -0.119  5.537  7023.152  0.019 1.450
climate       0.514  8.906   753.015  0.003 2.468
plunder      -0.625 11.515   783.265  0.001 1.296
fratgrpstr    0.264  7.442    53.750  0.009 1.962
marrcaptives  0.329 10.381   210.246  0.001 1.388
smlpop        0.245  1.244   813.703  0.265 1.934
>  r2
 R2:final model R2:IV(distance) R2:IV(language) 
      0.4963125       0.9866688       0.9812159 
>  ccc
                 Fstat         df pvalue
RESET            1.984    536.228  0.160
Wald on restrs. 40.017    161.529  0.000 <-- Still omitted variable(s)
NCV              0.060   2135.191  0.807
SWnormal         0.179   5833.837  0.672
lagll            1.193 333238.710  0.275
lagdd            2.923 253018.766  0.087
>

[edit] A| substitute pctFemPolyg=SCCS$v872 for polygyny=v860

######(dropped in 4 places from independent variables)
Program 1 and 2 from Eff and Dow 2009
#MI--make the imputed datasets
#--change the following path to the directory with your data and program--
setwd("c:/My Documents/MI")
rm(list=ls(all=TRUE))
options(echo=TRUE)
#--you need the following two packages--you must install them first--
library(foreign)
library(mice)

#--To find the citation for a package, use this function:---
citation("mice")

#-----------------------------
#--Read in data, rearrange----
#-----------------------------

#--Read in auxiliary variables---
load("vaux.Rdata",.GlobalEnv)
row.names(vaux)<-NULL
#--Read in the SCCS dataset---
load("SCCS.Rdata",.GlobalEnv)

#--look at first 6 rows of vaux--
head(vaux)
#--look at field names of vaux--
names(vaux)
#--check to see that rows are properly aligned in the two datasets--
#--sum should equal 186---
sum((SCCS$socname==vaux$socname)*1)
#--remove the society name field--
vaux<-vaux[,-28]
names(vaux)

#--Two nominal variables: brg and rlg----
#--brg: consolidated Burton  Regions-----
#0 = (rest of world) circumpolar, South and Meso-America, west North America
#1 = Subsaharan Africa
#2 = Middle Old World
#3 = Southeast Asia, Insular Pacific, Sahul
#4 = Eastern Americas
#--rlg: Religion---
#'0 (no world religion)'  
#'1 (Christianity)'  
#'2 (Islam)'  
#'3 (Hindu/Buddhist)'  

#--check to see number of missing values in vaux, 
#--whether variables are numeric,
#--and number of discrete values for each variable---
vvn<-names(vaux)
pp<-NULL
for (i in 1:length(vvn)){
nmiss<-length(which(is.na(vaux[,vvn[i]])))
numeric<-is.numeric(vaux[,vvn[i]])
numDiscrVals<-length(table(vaux[,vvn[i]]))
pp<-rbind(pp,cbind(data.frame(numeric),nmiss,numDiscrVals))
}
row.names(pp)<-vvn
pp

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--extract variables to be used from SCCS, put in dataframe fx--
fx<-data.frame(
socname=SCCS$socname,socID=SCCS$"sccs#",
valchild=(SCCS$v473+SCCS$v474+SCCS$v475+SCCS$v476),
cultints=SCCS$v232,roots=(SCCS$v233==5)*1,
cereals=(SCCS$v233==6)*1,gath=SCCS$v203,hunt=SCCS$v204,
fish=SCCS$v205,anim=SCCS$v206,femsubs=SCCS$v890,
pigs=(SCCS$v244==2)*1,milk=(SCCS$v245>1)*1,plow=(SCCS$v243>1)*1,
bovines=(SCCS$v244==7)*1,tree=(SCCS$v233==4)*1,
foodtrade=SCCS$v819,foodscarc=SCCS$v1685,
ecorich=SCCS$v857,popdens=SCCS$v156,pathstress=SCCS$v1260,
CVrain=SCCS$v1914/SCCS$v1913,rain=SCCS$v854,temp=SCCS$v855,
AP1=SCCS$v921,AP2=SCCS$v928,ndrymonth=SCCS$v196,
exogamy=SCCS$v72,ncmallow=SCCS$v227,famsize=SCCS$v80,
settype=SCCS$v234,localjh=(SCCS$v236-1),superjh=SCCS$v237,
moralgods=SCCS$v238,fempower=SCCS$v663,
sexratio=1+(SCCS$v1689>85)+(SCCS$v1689>115),
war=SCCS$v1648,himilexp=(SCCS$v899==1)*1,
money=SCCS$v155,wagelabor=SCCS$v1732,
migr=(SCCS$v677==2)*1,brideprice=(SCCS$v208==1)*1,
nuclearfam=(SCCS$v210<=3)*1######,pctFemPolyg=SCCS$v872##THIS IS NOW A DEPVAR not an indep var
,plunder=SCCS$v912,fratgrpstr=SCCS$v570,      ####ADDED three variables####
climate<-(SCCS$v857==1 | SCCS$v857==2 | SCCS$v857==6)*1+(SCCS$v857==3 | #####ADDED CLIMATE
SCCS$v857==4)*2+(SCCS$v857==5)*3                                        #####ADDED CLIMATE
#####--Should double-check everything                 
,cbind(SCCS$v857,climate),                                                #####ADDED CLIMATE
marrcaptives=SCCS$v870,smlpop=(SCCS$v1122<=3)*1####ADDED three variables####
)

#--look at first 6 rows of fx--
head(fx)

#--check to see number of missing values--
#--also check whether numeric--
vvn<-names(fx)
pp<-NULL
for (i in 1:length(vvn)){
nmiss<-length(which(is.na(fx[,vvn[i]])))
numeric<-is.numeric(fx[,vvn[i]])
pp<-rbind(pp,cbind(nmiss,data.frame(numeric)))
}
row.names(pp)<-vvn
pp

#--identify variables with missing values--
z<-which(pp[,1]>0)
zv1<-vvn[z]
zv1
#--identify variables with non-missing values--
z<-which(pp[,1]==0)
zv2<-vvn[z]
zv2

#-----------------------------
#----Multiple imputation------
#-----------------------------

#--number of imputed data sets to create--
nimp<-10
#--one at a time, loop through those variables with missing values--
for (i in 1:length(zv1)){
#--attach the imputand to the auxiliary data--
zxx<-data.frame(cbind(vaux,fx[,zv1[i]]))
#--in the following line, the imputation is done--
aqq<-complete(mice(zxx,maxit=100,m=nimp),action="long")
#--during first iteration of the loop, create dataframe impdat--
if (i==1){
impdat<-data.frame(aqq[,c(".id",".imp")])
}
#--the imputand is placed as a field in impdat and named--
impdat<-cbind(impdat,data.frame(aqq[,NCOL(zxx)]))
names(impdat)[NCOL(impdat)]<-zv1[i]
}

#--now the non-missing variables are attached to impdat--
gg<-NULL
for (i in 1:nimp){
gg<-rbind(gg,data.frame(fx[,zv2]))
}
impdat<-cbind(impdat,gg)

#--take a look at the top 6 and bottom 6 rows of impdat--
head(impdat)
tail(impdat)

#--impdat is saved as an R-format data file--
save(impdat,file="impdat.Rdata")


 
Program 2
#MI--estimate model with network-lagged dependent variables, combine results
rm(list=ls(all=TRUE))
#--Set path to your directory with data and program--
setwd("c:/My Documents/MI")
options(echo=TRUE)

#--need these packages for estimation and diagnostics--
library(foreign)
library(spdep)
library(car)
library(lmtest)
library(sandwich)

#-----------------------------
#--Read in data, rearrange----
#-----------------------------

#--Read in original SCCS data---
load("SCCS.Rdata",.GlobalEnv)
#--Read in two weight matrices--
ll<-as.matrix(read.dta("langwm.dta")[,-1])
dd<-as.matrix(read.dta("dist25wm.dta")[,c(-1,-2,-189)])
#--Read in the imputed dataset---
load("impdat.Rdata",.GlobalEnv)

#HERE YOU CHANGE HOW THE DEPENDENT VARIABLE IS COMPUTED FOR A NEW PROJECT
#--create dep.varb. you wish to use from SCCS data--
#--Here we sum variables measuring how much a society values children--
#--can replace "sum" with "max"
###depvar<-apply(SCCS[,c("v473","v474","v475","v476")],1,sum) 
   depvar<-SCCS$v872 ### CHANGED DEPENDENT VARIABLE TO "pctFemPolyg"
#--find obs. for which dep. varb. is non-missing--
zdv<-which(!is.na(depvar))
depvar<-depvar[zdv]
#HERE GIVE THE "NAME" OF THE DEPENDENT VARIABLE THAT IS COMPUTED
depvarname<-"pctFemPolyg"  ###"child value" CHANGED DEPVAR TO "pctFemPolyg"
#--can add additional SCCS variable, but only if it has no missing values---
dateobs<-SCCS$v838
dateobs<-dateobs[zdv]

#--look at frequencies and quartiles for the dep. varb.--
summary(depvar)
table(depvar)

#--modify weight matrices---
#--set diagonal equal to zeros--
diag(ll)<-0
diag(dd)<-0
#--use only obs. where dep. varb. non-missing--
ll<-ll[zdv,zdv]
dd<-dd[zdv,zdv]
#--row standardize (rows sum to one)
ll<-ll/rowSums(ll)
dd<-dd/rowSums(dd)
#--make weight matrix object for later autocorrelation test--
wmatll<-mat2listw(as.matrix(ll))
wmatdd<-mat2listw(as.matrix(dd))

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
indpv<-c("femsubs","foodscarc","exogamy","ncmallow","superjh","moralgods",
"fempower","sexratio","war","himilexp","wagelabor","famsize","settype",
"localjh","money","cultints","roots","cereals","gath","hunt","fish",
"anim","pigs","milk","plow","bovines","tree","foodtrade",
"ndrymonth","ecorich","popdens","pathstress","CVrain","rain",
"fratgrpstr", "marrcaptives", "smlpop",####ADDED three variables
"temp","AP1","AP2","migr","brideprice","nuclearfam")######,"pctFemPolyg")

#-----------------------------------------------------
#---Estimate model on each imputed dataset------------
#-----------------------------------------------------

#--number of imputed datasets--
nimp<-10

#--will append values to these empty objects--
vif<-NULL
ss<-NULL
beta<-NULL
dng<-NULL

#--loop through the imputed datasets--
for (i in 1:nimp){

#--select the ith imputed dataset--
m9<-impdat[which(impdat$.imp==i),]
#--retain only obs. for which dep. varb. is nonmissing--
m9<-m9[zdv,]

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--create spatially lagged dep. varbs. in stage 1 OLS--
y<-as.matrix(depvar)
xx<-as.matrix(m9[,indpv])
#--for instruments we use the spatial lag of our indep. varbs.--
#--First, the spatially lagged varb. for distance--
xdy<-dd%*%xx
cyd<-dd%*%y
o<-lm(cyd~xdy)
#--the fitted value is our instrumental variable--
fydd<-fitted(o)
#--keep R2 from this regression--
dr2<-summary(o)$r.squared
#--Then, the spatially lagged varb. for language--
xly<-ll%*%xx   
cyl<-ll%*%y
o<-lm(cyl~xly)
#--the fitted value is our instrumental variable--
fyll<-fitted(o)
#--keep R2 from this regression--
lr2<-summary(o)$r.squared
m9<-cbind(m9,fydd,fyll)

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--Stage 2 OLS estimate of unrestricted model--
xUR<-lm(depvar~fyll+fydd+dateobs+
cultints+roots+cereals+gath+plow+
hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
+ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
settype+localjh+superjh+moralgods+fempower+femsubs+
sexratio+war+himilexp+money+wagelabor+
migr+brideprice+nuclearfam######+pctFemPolyg
+climate+plunder+fratgrpstr+marrcaptives+smlpop####ADDED three variables PLUS CLIMATE
,data=m9)

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--Stage 2 OLS estimate of restricted model--
###xR<-lm(depvar ~ fyll + cultints + roots + fish + 
###    exogamy + settype + femsubs, data = m9)
###REDEFINED xR in the next command: Only those significant
xR<-lm(depvar~fyll+fydd+###dateobs+ ###Take out this one variables
plow+anim+milk+foodtrade
+ecorich+fish####ADDED from existing independent variables
+climate+plunder+fratgrpstr+marrcaptives+smlpop####ADDED three variables PLUS CLIMATE
####+FratIntPres+plunder####newly defined v569, 912###ALSO newly added to INDEP VARS
##cultints+roots+cereals+gath+plow+
##hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
##+ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
##settype+localjh+superjh+moralgods+fempower+femsubs+
##sexratio+war+himilexp+money+wagelabor+
##migr+brideprice+nuclearfam######+pctFemPolyg
,data=m9)


#--corrected sigma2 and R2 for 2SLS--
qxx<-m9
qxx[,"fydd"]<-cyd
qxx[,"fyll"]<-cyl
b<-coef(xR)
incpt<-matrix(1,NROW(qxx),1)
x<-as.matrix(cbind(incpt,qxx[,names(b)[-1]]))
e<-y-x%*%as.matrix(b)
cs2<-as.numeric(t(e)%*%e/(NROW(x)-NCOL(x)))
cr2<-as.numeric(1-t(e)%*%e/sum((y-mean(y))^2))

#--collect coefficients and their variances--
ov<-summary(xR)
vif<-rbind(vif,vif(xR))
ss<-rbind(ss,diag(ov$cov*cs2))
#--collect robust coef. variances when there is heteroskedasticity--
#eb<-e^2
#x<-as.matrix(cbind(incpt,m9[,names(b)[-1]]))
#hcm<-inv(t(x)%*%x)%*%t(x)%*%diag(eb[1:length(eb)])%*%x%*%inv(t(x)%*%x)
#ss<-rbind(ss,diag(hcm))
beta<-rbind(beta,coef(xR))

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--collect some model diagnostics--
dropt<-c("cereals","gath","plow","hunt","anim","dateobs",
"pigs","milk","bovines","foodscarc","ecorich",
"popdens","pathstress","ncmallow","famsize","localjh",
"superjh","moralgods","fempower","sexratio","money",
"climate","plunder","fratgrpstr","marrcaptives","smlpop",####ADDED three variables PLUS CLIMATE
"fydd","wagelabor","war","himilexp","tree","foodtrade")


#--Ramsey RESET test--
p1<-qchisq(resettest(xR,type="fitted")$"p.value",1,lower.tail=FALSE)
#--Wald test (H0: dropped variables have coefficient equal zero)--
o<-linear.hypothesis(xUR,dropt,test="Chisq")$"Pr(>Chisq)"[2]
p2<-qchisq(o,1,lower.tail=FALSE) #find Chisq with 1 d.f. and same pvalue
#--Heteroskedasticity test (H0: homoskedastic residuals)--
p3<-ncv.test(xR)$ChiSquare
#--Shapiro-Wilke normality test (H0: residuals normal)
p4<-qchisq(shapiro.test(e)$p.value,1,lower.tail=FALSE)
#--LaGrange Multiplier test for spatial autocorrelation: language--
o<-lm.LMtests(xR, wmatll, test=c("LMlag"))
p5<-as.numeric(o$LMlag$statistic)
#--LaGrange Multiplier test for spatial autocorrelation: distance--
o<-lm.LMtests(xR, wmatdd, test=c("LMlag"))
p6<-as.numeric(o$LMlag$statistic)
#--model R2--
p7<-cr2
dng<-rbind(dng,cbind(p1,p2,p3,p4,p5,p6,p7,dr2,lr2))

}

#--------------------------------------------
#--Rubin's formulas for combining estimates--
#--------------------------------------------

#--first find final regr. coefs. and p-values--
mnb<-apply(beta,2,mean)
vrb<-colSums((beta-t(matrix(mnb,length(mnb),10)))^2)/(nimp-1)
mnv<-apply(ss,2,mean)
vrT<-mnv+vrb*(1-nimp^(-1))
fst<-mnb^2/vrT
r<-(1+nimp^(-1))*vrb/mnv
v<-(nimp-1)*(1+r^(-1))^2
pval<-pf(fst,1,v,lower.tail=FALSE)
bbb<-data.frame(round(cbind(mnb,fst,v,pval),3))
bbb$VIF[2:NROW(bbb)]<-round(apply(vif,2,mean),3)
names(bbb)<-c("coef","Fstat","ddf","pvalue","VIF")

#--Then combine the diagnostics we collected--
dng<-data.frame(dng)
names(dng)<-c("RESET","Wald on restrs.","NCV","SWnormal","lagll","lagdd",
"R2:final model","R2:IV(distance)","R2:IV(language)")
r2<-apply(dng[,7:9],2,mean)
adng<-dng[,1:6]
mdm<-apply(adng,2,mean)
vrd<-colSums((adng-t(matrix(mdm,length(mdm),nimp)))^2)/(nimp-1)
aa<-4*mdm^2-2*vrd
aa[which(aa<0)]<-0
rd<-(1+nimp^(-1))*vrd/(2*mdm+aa^.5)
vd<-(nimp-1)*(1+rd^(-1))^2
Dm<-(mdm-(nimp-1)/(nimp+1)*rd)/(1+rd)
#-All chi-sq we collected have df=1-------
pvald<-pf(Dm,1,vd,lower.tail=FALSE)
ccc<-data.frame(round(cbind(Dm,vd,pvald),3))
names(ccc)<-c("Fstat","df","pvalue")

bbb
r2
ccc
#Corrected to publication version with depvarname
#--write results to csv file for perusal in spreadsheet--
write.csv(paste("2SLS model for ",depvarname,sep=""),file="OLSresults.csv", append=FALSE)
write.csv(bbb,file="OLSresults.csv",append=TRUE)
write.csv(r2,file="OLSresults.csv",append=TRUE)
write.csv(ccc,file="OLSresults.csv",append=TRUE)


>  bbb
              coef  Fstat       ddf pvalue   VIF

(Intercept) -6.685 0.182 2907.569 0.670 NA fyll 0.593 1.600 36952.826 0.206 1.954 fydd 0.412 3.126 7220.881 0.077 2.428 plow -6.913 0.850 3400.677 0.357 2.218 anim 0.193 0.019 26798.345 0.891 2.590 milk -3.166 0.202 4752.296 0.653 2.846 foodtrade -0.017 0.008 4361.707 0.931 1.225 ecorich -4.516 2.340 2367.661 0.126 3.658 fish -4.706 14.131 21205.243 0.000 1.477 climate 9.942 3.989 863.506 0.046 3.289 plunder -6.783 2.167 879.830 0.141 1.330 fratgrpstr 4.589 4.563 171.737 0.034 1.926 marrcaptives 3.677 2.175 504.869 0.141 1.431 smlpop 10.155 3.777 1812.562 0.052 1.839 > r2

R2:final model R2:IV(distance) R2:IV(language) 
     0.4048324       0.9858379       0.9829404 

> ccc

                Fstat          df pvalue

RESET 2.687 1956.390 0.101 Wald on restrs. 8.264 363.213 0.004 NCV 15.146 2301.375 0.000 SWnormal 9.321 7744.546 0.002 lagll 0.091 1237474.986 0.763 lagdd 1.145 57685.536 0.285

[edit] B| Results for pctFemPolyg R2=.40

>  bbb
1	2SLS model for pctFemPolyg (polygyny depvar)
               coef  Fstat       ddf pvalue   VIF
(Intercept)  -6.400  0.169  4569.307  0.681    NA
fyll          0.601  1.602  6442.466  0.206 1.959
fydd          0.416  3.157 12501.373  0.076 2.455
plow         -6.984  0.877 28045.989  0.349 2.248
anim          0.237  0.028 11074.920  0.867 2.588
milk         -3.291  0.217  6628.079  0.642 2.869
foodtrade    -0.020  0.011 16451.304  0.916 1.221
ecorich      -4.556  2.339  1830.552  0.126 3.679
fish         -4.817 15.005 31693.269  0.000 1.455
climate       9.959  4.083  1606.577  0.043 3.290
plunder      -7.068  2.459  1951.552  0.117 1.304
fratgrpstr    4.529  4.344   258.370  0.038 2.002
marrcaptives  3.716  2.456 15590.921  0.117 1.415
smlpop       10.489  3.967  2714.303  0.047 1.882
>  r2
 R2:final model R2:IV(distance) R2:IV(language) 
      0.4022099       0.9859886       0.9828142 
>  ccc
                 Fstat          df pvalue
RESET            2.606    4155.811  0.107
Wald on restrs.  7.788     677.723  0.005
NCV             13.898    1150.388  0.000
SWnormal         9.893    4590.932  0.002
lagll            0.089 4037335.687  0.765
lagdd            1.168 1142869.805  0.280
>

[edit] A| substitute self-amplifying=SCCS$v876 for pctFemPolyg=v872 R2=.39

######(dropped in 4 places from independent variables)
Program 1 and 2 from Eff and Dow 2009
#MI--make the imputed datasets
#--change the following path to the directory with your data and program--
setwd("c:/My Documents/MI")
rm(list=ls(all=TRUE))
options(echo=TRUE)
#--you need the following two packages--you must install them first--
library(foreign)
library(mice)

#--To find the citation for a package, use this function:---
citation("mice")

#-----------------------------
#--Read in data, rearrange----
#-----------------------------

#--Read in auxiliary variables---
load("vaux.Rdata",.GlobalEnv)
row.names(vaux)<-NULL
#--Read in the SCCS dataset---
load("SCCS.Rdata",.GlobalEnv)

#--look at first 6 rows of vaux--
head(vaux)
#--look at field names of vaux--
names(vaux)
#--check to see that rows are properly aligned in the two datasets--
#--sum should equal 186---
sum((SCCS$socname==vaux$socname)*1)
#--remove the society name field--
vaux<-vaux[,-28]
names(vaux)

#--Two nominal variables: brg and rlg----
#--brg: consolidated Burton  Regions-----
#0 = (rest of world) circumpolar, South and Meso-America, west North America
#1 = Subsaharan Africa
#2 = Middle Old World
#3 = Southeast Asia, Insular Pacific, Sahul
#4 = Eastern Americas
#--rlg: Religion---
#'0 (no world religion)'  
#'1 (Christianity)'  
#'2 (Islam)'  
#'3 (Hindu/Buddhist)'  

#--check to see number of missing values in vaux, 
#--whether variables are numeric,
#--and number of discrete values for each variable---
vvn<-names(vaux)
pp<-NULL
for (i in 1:length(vvn)){
nmiss<-length(which(is.na(vaux[,vvn[i]])))
numeric<-is.numeric(vaux[,vvn[i]])
numDiscrVals<-length(table(vaux[,vvn[i]]))
pp<-rbind(pp,cbind(data.frame(numeric),nmiss,numDiscrVals))
}
row.names(pp)<-vvn
pp

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--extract variables to be used from SCCS, put in dataframe fx--
fx<-data.frame(
socname=SCCS$socname,socID=SCCS$"sccs#",
valchild=(SCCS$v473+SCCS$v474+SCCS$v475+SCCS$v476),
cultints=SCCS$v232,roots=(SCCS$v233==5)*1,
cereals=(SCCS$v233==6)*1,gath=SCCS$v203,hunt=SCCS$v204,
fish=SCCS$v205,anim=SCCS$v206,femsubs=SCCS$v890,
pigs=(SCCS$v244==2)*1,milk=(SCCS$v245>1)*1,plow=(SCCS$v243>1)*1,
bovines=(SCCS$v244==7)*1,tree=(SCCS$v233==4)*1,
foodtrade=SCCS$v819,foodscarc=SCCS$v1685,
ecorich=SCCS$v857,popdens=SCCS$v156,pathstress=SCCS$v1260,
CVrain=SCCS$v1914/SCCS$v1913,rain=SCCS$v854,temp=SCCS$v855,
AP1=SCCS$v921,AP2=SCCS$v928,ndrymonth=SCCS$v196,
exogamy=SCCS$v72,ncmallow=SCCS$v227,famsize=SCCS$v80,
settype=SCCS$v234,localjh=(SCCS$v236-1),superjh=SCCS$v237,
moralgods=SCCS$v238,fempower=SCCS$v663,
sexratio=1+(SCCS$v1689>85)+(SCCS$v1689>115),
war=SCCS$v1648,himilexp=(SCCS$v899==1)*1,
money=SCCS$v155,wagelabor=SCCS$v1732,
migr=(SCCS$v677==2)*1,brideprice=(SCCS$v208==1)*1,
nuclearfam=(SCCS$v210<=3)*1
,plunder=SCCS$v912,fratgrpstr=SCCS$v570,      ####ADDED three variables####
climate<-(SCCS$v857==1 | SCCS$v857==2 | SCCS$v857==6)*1+(SCCS$v857==3 | #####ADDED CLIMATE
SCCS$v857==4)*2+(SCCS$v857==5)*3                                        #####ADDED CLIMATE
#####--Should double-check everything                 
,cbind(SCCS$v857,climate),                                                #####ADDED CLIMATE
marrcaptives=SCCS$v870,smlpop=(SCCS$v1122<=3)*1####ADDED three variables####
)

#--look at first 6 rows of fx--
head(fx)

#--check to see number of missing values--
#--also check whether numeric--
vvn<-names(fx)
pp<-NULL
for (i in 1:length(vvn)){
nmiss<-length(which(is.na(fx[,vvn[i]])))
numeric<-is.numeric(fx[,vvn[i]])
pp<-rbind(pp,cbind(nmiss,data.frame(numeric)))
}
row.names(pp)<-vvn
pp

#--identify variables with missing values--
z<-which(pp[,1]>0)
zv1<-vvn[z]
zv1
#--identify variables with non-missing values--
z<-which(pp[,1]==0)
zv2<-vvn[z]
zv2

#-----------------------------
#----Multiple imputation------
#-----------------------------

#--number of imputed data sets to create--
nimp<-10
#--one at a time, loop through those variables with missing values--
for (i in 1:length(zv1)){
#--attach the imputand to the auxiliary data--
zxx<-data.frame(cbind(vaux,fx[,zv1[i]]))
#--in the following line, the imputation is done--
aqq<-complete(mice(zxx,maxit=100,m=nimp),action="long")
#--during first iteration of the loop, create dataframe impdat--
if (i==1){
impdat<-data.frame(aqq[,c(".id",".imp")])
}
#--the imputand is placed as a field in impdat and named--
impdat<-cbind(impdat,data.frame(aqq[,NCOL(zxx)]))
names(impdat)[NCOL(impdat)]<-zv1[i]
}

#--now the non-missing variables are attached to impdat--
gg<-NULL
for (i in 1:nimp){
gg<-rbind(gg,data.frame(fx[,zv2]))
}
impdat<-cbind(impdat,gg)

#--take a look at the top 6 and bottom 6 rows of impdat--
head(impdat)
tail(impdat)

#--impdat is saved as an R-format data file--
save(impdat,file="impdat.Rdata")


 
Program 2
#MI--estimate model with network-lagged dependent variables, combine results
rm(list=ls(all=TRUE))
#--Set path to your directory with data and program--
setwd("c:/My Documents/MI")
options(echo=TRUE)

#--need these packages for estimation and diagnostics--
library(foreign)
library(spdep)
library(car)
library(lmtest)
library(sandwich)

#-----------------------------
#--Read in data, rearrange----
#-----------------------------

#--Read in original SCCS data---
load("SCCS.Rdata",.GlobalEnv)
#--Read in two weight matrices--
ll<-as.matrix(read.dta("langwm.dta")[,-1])
dd<-as.matrix(read.dta("dist25wm.dta")[,c(-1,-2,-189)])
#--Read in the imputed dataset---
load("impdat.Rdata",.GlobalEnv)

#HERE YOU CHANGE HOW THE DEPENDENT VARIABLE IS COMPUTED FOR A NEW PROJECT
#--create dep.varb. you wish to use from SCCS data--
#--Here we sum variables measuring how much a society values children--
#--can replace "sum" with "max"
###depvar<-apply(SCCS[,c("v473","v474","v475","v476")],1,sum) 
   depvar<-SCCS$v876 ### depvar CHANGED TO "self-amplifying"
#--find obs. for which dep. varb. is non-missing--
zdv<-which(!is.na(depvar))
depvar<-depvar[zdv]
#HERE GIVE THE "NAME" OF THE DEPENDENT VARIABLE THAT IS COMPUTED
depvarname<-"self-amplifying"  ### depvarname CHANGED DEPVAR TO "self-amplifying" v876
#--can add additional SCCS variable, but only if it has no missing values---
dateobs<-SCCS$v838
dateobs<-dateobs[zdv]

#--look at frequencies and quartiles for the dep. varb.--
summary(depvar)
table(depvar)

#--modify weight matrices---
#--set diagonal equal to zeros--
diag(ll)<-0
diag(dd)<-0
#--use only obs. where dep. varb. non-missing--
ll<-ll[zdv,zdv]
dd<-dd[zdv,zdv]
#--row standardize (rows sum to one)
ll<-ll/rowSums(ll)
dd<-dd/rowSums(dd)
#--make weight matrix object for later autocorrelation test--
wmatll<-mat2listw(as.matrix(ll))
wmatdd<-mat2listw(as.matrix(dd))

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
indpv<-c("femsubs","foodscarc","exogamy","ncmallow","superjh","moralgods",
"fempower","sexratio","war","himilexp","wagelabor","famsize","settype",
"localjh","money","cultints","roots","cereals","gath","hunt","fish",
"anim","pigs","milk","plow","bovines","tree","foodtrade",
"ndrymonth","ecorich","popdens","pathstress","CVrain","rain",
"fratgrpstr", "marrcaptives", "smlpop",####ADDED three variables
"temp","AP1","AP2","migr","brideprice","nuclearfam")######,"pctFemPolyg")

#-----------------------------------------------------
#---Estimate model on each imputed dataset------------
#-----------------------------------------------------

#--number of imputed datasets--
nimp<-10

#--will append values to these empty objects--
vif<-NULL
ss<-NULL
beta<-NULL
dng<-NULL

#--loop through the imputed datasets--
for (i in 1:nimp){

#--select the ith imputed dataset--
m9<-impdat[which(impdat$.imp==i),]
#--retain only obs. for which dep. varb. is nonmissing--
m9<-m9[zdv,]

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--create spatially lagged dep. varbs. in stage 1 OLS--
y<-as.matrix(depvar)
xx<-as.matrix(m9[,indpv])
#--for instruments we use the spatial lag of our indep. varbs.--
#--First, the spatially lagged varb. for distance--
xdy<-dd%*%xx
cyd<-dd%*%y
o<-lm(cyd~xdy)
#--the fitted value is our instrumental variable--
fydd<-fitted(o)
#--keep R2 from this regression--
dr2<-summary(o)$r.squared
#--Then, the spatially lagged varb. for language--
xly<-ll%*%xx   
cyl<-ll%*%y
o<-lm(cyl~xly)
#--the fitted value is our instrumental variable--
fyll<-fitted(o)
#--keep R2 from this regression--
lr2<-summary(o)$r.squared
m9<-cbind(m9,fydd,fyll)

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--Stage 2 OLS estimate of unrestricted model--
xUR<-lm(depvar~fyll+fydd+dateobs+
cultints+roots+cereals+gath+plow+
hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
+ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
settype+localjh+superjh+moralgods+fempower+femsubs+
sexratio+war+himilexp+money+wagelabor+
migr+brideprice+nuclearfam
+climate+plunder+fratgrpstr+marrcaptives+smlpop####ADDED three variables PLUS CLIMATE
,data=m9)

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--Stage 2 OLS estimate of restricted model--
###xR<-lm(depvar ~ fyll + cultints + roots + fish + 
###    exogamy + settype + femsubs, data = m9)
###REDEFINED xR in the next command: Only those significant
xR<-lm(depvar~fyll+fydd+###dateobs+ ###Take out this one variables
plow+anim+milk+foodtrade
+ecorich+fish####ADDED from existing independent variables
+climate+plunder+fratgrpstr+marrcaptives+smlpop####ADDED three variables PLUS CLIMATE
####+FratIntPres+plunder####newly defined v569, 912###ALSO newly added to INDEP VARS
##cultints+roots+cereals+gath+plow+
##hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
##+ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
##settype+localjh+superjh+moralgods+fempower+femsubs+
##sexratio+war+himilexp+money+wagelabor+
##migr+brideprice+nuclearfam
,data=m9)


#--corrected sigma2 and R2 for 2SLS--
qxx<-m9
qxx[,"fydd"]<-cyd
qxx[,"fyll"]<-cyl
b<-coef(xR)
incpt<-matrix(1,NROW(qxx),1)
x<-as.matrix(cbind(incpt,qxx[,names(b)[-1]]))
e<-y-x%*%as.matrix(b)
cs2<-as.numeric(t(e)%*%e/(NROW(x)-NCOL(x)))
cr2<-as.numeric(1-t(e)%*%e/sum((y-mean(y))^2))

#--collect coefficients and their variances--
ov<-summary(xR)
vif<-rbind(vif,vif(xR))
ss<-rbind(ss,diag(ov$cov*cs2))
#--collect robust coef. variances when there is heteroskedasticity--
#eb<-e^2
#x<-as.matrix(cbind(incpt,m9[,names(b)[-1]]))
#hcm<-inv(t(x)%*%x)%*%t(x)%*%diag(eb[1:length(eb)])%*%x%*%inv(t(x)%*%x)
#ss<-rbind(ss,diag(hcm))
beta<-rbind(beta,coef(xR))

#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--collect some model diagnostics--
dropt<-c("cereals","gath","plow","hunt","anim","dateobs",
"pigs","milk","bovines","foodscarc","ecorich",
"popdens","pathstress","ncmallow","famsize","localjh",
"superjh","moralgods","fempower","sexratio","money",
"climate","plunder","fratgrpstr","marrcaptives","smlpop",####ADDED three variables PLUS CLIMATE
"fydd","wagelabor","war","himilexp","tree","foodtrade")


#--Ramsey RESET test--
p1<-qchisq(resettest(xR,type="fitted")$"p.value",1,lower.tail=FALSE)
#--Wald test (H0: dropped variables have coefficient equal zero)--
o<-linear.hypothesis(xUR,dropt,test="Chisq")$"Pr(>Chisq)"[2]
p2<-qchisq(o,1,lower.tail=FALSE) #find Chisq with 1 d.f. and same pvalue
#--Heteroskedasticity test (H0: homoskedastic residuals)--
p3<-ncv.test(xR)$ChiSquare
#--Shapiro-Wilke normality test (H0: residuals normal)
p4<-qchisq(shapiro.test(e)$p.value,1,lower.tail=FALSE)
#--LaGrange Multiplier test for spatial autocorrelation: language--
o<-lm.LMtests(xR, wmatll, test=c("LMlag"))
p5<-as.numeric(o$LMlag$statistic)
#--LaGrange Multiplier test for spatial autocorrelation: distance--
o<-lm.LMtests(xR, wmatdd, test=c("LMlag"))
p6<-as.numeric(o$LMlag$statistic)
#--model R2--
p7<-cr2
dng<-rbind(dng,cbind(p1,p2,p3,p4,p5,p6,p7,dr2,lr2))

}

#--------------------------------------------
#--Rubin's formulas for combining estimates--
#--------------------------------------------

#--first find final regr. coefs. and p-values--
mnb<-apply(beta,2,mean)
vrb<-colSums((beta-t(matrix(mnb,length(mnb),10)))^2)/(nimp-1)
mnv<-apply(ss,2,mean)
vrT<-mnv+vrb*(1-nimp^(-1))
fst<-mnb^2/vrT
r<-(1+nimp^(-1))*vrb/mnv
v<-(nimp-1)*(1+r^(-1))^2
pval<-pf(fst,1,v,lower.tail=FALSE)
bbb<-data.frame(round(cbind(mnb,fst,v,pval),3))
bbb$VIF[2:NROW(bbb)]<-round(apply(vif,2,mean),3)
names(bbb)<-c("coef","Fstat","ddf","pvalue","VIF")

#--Then combine the diagnostics we collected--
dng<-data.frame(dng)
names(dng)<-c("RESET","Wald on restrs.","NCV","SWnormal","lagll","lagdd",
"R2:final model","R2:IV(distance)","R2:IV(language)")
r2<-apply(dng[,7:9],2,mean)
adng<-dng[,1:6]
mdm<-apply(adng,2,mean)
vrd<-colSums((adng-t(matrix(mdm,length(mdm),nimp)))^2)/(nimp-1)
aa<-4*mdm^2-2*vrd
aa[which(aa<0)]<-0
rd<-(1+nimp^(-1))*vrd/(2*mdm+aa^.5)
vd<-(nimp-1)*(1+rd^(-1))^2
Dm<-(mdm-(nimp-1)/(nimp+1)*rd)/(1+rd)
#-All chi-sq we collected have df=1-------
pvald<-pf(Dm,1,vd,lower.tail=FALSE)
ccc<-data.frame(round(cbind(Dm,vd,pvald),3))
names(ccc)<-c("Fstat","df","pvalue")

bbb
r2
ccc

#Corrected to publication version with depvarname
#--write results to csv file for perusal in spreadsheet--
write.csv(paste("2SLS model for ",depvarname,sep=""),file="OLSresults.csv", append=FALSE)
write.csv(bbb,file="OLSresults.csv",append=TRUE)
write.csv(r2,file="OLSresults.csv",append=TRUE)
write.csv(ccc,file="OLSresults.csv",append=TRUE)

[edit] Results for self-amplifying polygyny v876 R2=.39 (poor VIF)

876.  Polygyny Distributions
   129    . = Missing data <-- lots of missing data
    27    0 = True Binomial
    30    1 = Negative Binomial
                
bbb
1	2SLS model for self-amplifying (polygyny)
               coef Fstat        ddf pvalue   VIF
(Intercept)   1.062 1.113  23647.208  0.291    NA
fyll          0.638 0.877   3416.281  0.349 2.421 <-- not significant
fydd         -0.403 0.702  46094.103  0.402 3.117 <-- not significant
plow         -0.549 2.532  14617.922  0.112 3.360 <-- not significant
anim         -0.024 0.228   9004.785  0.633 3.359 <-- not significant
milk          0.171 0.284  13238.107  0.594 5.282 <-- not significant
foodtrade     0.007 0.565 186303.681  0.452 1.714 <-- not significant
ecorich       0.001 0.000  23599.146  0.993 7.382 <-- not significant
fish         -0.050 0.563  28042.863  0.453 1.591 <-- not significant
climate      -0.027 0.014   8505.959  0.905 7.823 <-- not significant
plunder      -0.139 0.507    495.348  0.477 1.957 <-- not significant
fratgrpstr    0.041 0.228    174.068  0.634 2.854 <-- not significant
marrcaptives  0.155 2.271    934.925  0.132 2.294 <-- not significant
smlpop       -0.038 0.029  11313.618  0.864 2.905 <-- not significant

PROBLEM WITH R2 is too few cases (n=52). The VIF values are way way too high, e.g. ecorich, milk, anim, plow

>  r2
 R2:final model R2:IV(distance) R2:IV(language) 
      0.3921506       0.9946715       0.9955094 
>  ccc
                 Fstat         df pvalue
RESET            0.131   2795.626  0.718
Wald on restrs. -0.322     10.328  1.000
NCV              0.080   2977.227  0.777
SWnormal         0.629   5214.671  0.428
lagll            0.025 951136.960  0.875
lagdd            0.009 812955.724  0.923
>

depvar

[1] 1 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 2 2 1 2 1 1 1 2 2 1 2 1
[39] 1 1 1 2 1 2 2 1 2 1 1 1 2

[edit] Crosstabs

setwd("c:/My Documents/MI")
#--Read in the SCCS dataset---
load("SCCS.Rdata",.GlobalEnv)
dbl_std_sex=SCCS$v596
hus_wife_deference=SCCS$v615
library(gmodels)
tab=cbind(hus_wife_deference,dbl_std_sex)
tabl<-na.omit(tab)  #eliminate cases with missing data 
x=tabl[,1] #take variable for those cases
y=tabl[,2] #take variable for those cases
CrossTable(x,y,prop.r=FALSE, prop.c=FALSE, prop.t=FALSE, expected=TRUE)

Bela Nagy: Note that CrossTable will include NaN values which means not a number. The missing value is coded as NA in R. So changing all the NaN values to NA allows you to have missing values which CrossTable will not include. The quickest way to convert NaN to NA is this for variable x:

x[is.nan(x)]=NA
dbl_std_sex[is.nan(dbl_std_sex)]=NA
hus_wife_deference[is.nan(hus_wife_deference)]=NA
dbl_std_sex
hus_wife_deference
CrossTable( dbl_std_sex,hus_wife_deference,prop.r=FALSE, prop.c=FALSE, prop.t=FALSE, expected=TRUE)

This gives a nice table for a paper.