Depvar percent polygyny v827 R2=.40
From InterSciWiki
Contents |
[edit] 1
Back to Depvars. This is a Depvar series where Doug 08:02, 13 October 2009 (PDT) is experimenting with changing the dependent variable for different ways of measuring polygyny.
[edit] B| Results for polygyny=v860 r2=.50
These are results from polygyny v860 R2=.50
Still, Wald test shows omitted variable(s) Wald test H0: that the true values of these coefficients of dropped variables equal zero (none significant) p14 Eff and Dow 2009.
Ok, but the "dropt" variable list needs to include ALL THE OMITTED VARIABLED as in subheading 2A| 2.3. And lets at this point eliminate "plow"
> bbb
1 2SLS model for polygyny (depvar)
coef Fstat ddf pvalue VIF
(Intercept) 1.460 1.198 52027.088 0.274 NA
fyll -0.285 0.384 11482.367 0.535 2.192
fydd 0.721 14.820 1449.677 0.000 2.531
plow -0.163 0.330 4641.687 0.566 2.018
anim 0.059 1.008 899.793 0.316 2.582
milk -0.528 3.661 3333.265 0.056 2.771 <-- milk significant but high VIF
foodtrade -0.013 2.553 6786.690 0.110 1.166
ecorich -0.204 3.887 300.740 0.050 2.570
fish -0.119 5.537 7023.152 0.019 1.450
climate 0.514 8.906 753.015 0.003 2.468
plunder -0.625 11.515 783.265 0.001 1.296
fratgrpstr 0.264 7.442 53.750 0.009 1.962
marrcaptives 0.329 10.381 210.246 0.001 1.388
smlpop 0.245 1.244 813.703 0.265 1.934
> r2
R2:final model R2:IV(distance) R2:IV(language)
0.4963125 0.9866688 0.9812159
> ccc
Fstat df pvalue
RESET 1.984 536.228 0.160
Wald on restrs. 40.017 161.529 0.000 <-- Still omitted variable(s)
NCV 0.060 2135.191 0.807
SWnormal 0.179 5833.837 0.672
lagll 1.193 333238.710 0.275
lagdd 2.923 253018.766 0.087
>
[edit] A| substitute pctFemPolyg=SCCS$v872 for polygyny=v860
######(dropped in 4 places from independent variables)
Program 1 and 2 from Eff and Dow 2009
#MI--make the imputed datasets
#--change the following path to the directory with your data and program--
setwd("c:/My Documents/MI")
rm(list=ls(all=TRUE))
options(echo=TRUE)
#--you need the following two packages--you must install them first--
library(foreign)
library(mice)
#--To find the citation for a package, use this function:---
citation("mice")
#-----------------------------
#--Read in data, rearrange----
#-----------------------------
#--Read in auxiliary variables---
load("vaux.Rdata",.GlobalEnv)
row.names(vaux)<-NULL
#--Read in the SCCS dataset---
load("SCCS.Rdata",.GlobalEnv)
#--look at first 6 rows of vaux--
head(vaux)
#--look at field names of vaux--
names(vaux)
#--check to see that rows are properly aligned in the two datasets--
#--sum should equal 186---
sum((SCCS$socname==vaux$socname)*1)
#--remove the society name field--
vaux<-vaux[,-28]
names(vaux)
#--Two nominal variables: brg and rlg----
#--brg: consolidated Burton Regions-----
#0 = (rest of world) circumpolar, South and Meso-America, west North America
#1 = Subsaharan Africa
#2 = Middle Old World
#3 = Southeast Asia, Insular Pacific, Sahul
#4 = Eastern Americas
#--rlg: Religion---
#'0 (no world religion)'
#'1 (Christianity)'
#'2 (Islam)'
#'3 (Hindu/Buddhist)'
#--check to see number of missing values in vaux,
#--whether variables are numeric,
#--and number of discrete values for each variable---
vvn<-names(vaux)
pp<-NULL
for (i in 1:length(vvn)){
nmiss<-length(which(is.na(vaux[,vvn[i]])))
numeric<-is.numeric(vaux[,vvn[i]])
numDiscrVals<-length(table(vaux[,vvn[i]]))
pp<-rbind(pp,cbind(data.frame(numeric),nmiss,numDiscrVals))
}
row.names(pp)<-vvn
pp
#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--extract variables to be used from SCCS, put in dataframe fx--
fx<-data.frame(
socname=SCCS$socname,socID=SCCS$"sccs#",
valchild=(SCCS$v473+SCCS$v474+SCCS$v475+SCCS$v476),
cultints=SCCS$v232,roots=(SCCS$v233==5)*1,
cereals=(SCCS$v233==6)*1,gath=SCCS$v203,hunt=SCCS$v204,
fish=SCCS$v205,anim=SCCS$v206,femsubs=SCCS$v890,
pigs=(SCCS$v244==2)*1,milk=(SCCS$v245>1)*1,plow=(SCCS$v243>1)*1,
bovines=(SCCS$v244==7)*1,tree=(SCCS$v233==4)*1,
foodtrade=SCCS$v819,foodscarc=SCCS$v1685,
ecorich=SCCS$v857,popdens=SCCS$v156,pathstress=SCCS$v1260,
CVrain=SCCS$v1914/SCCS$v1913,rain=SCCS$v854,temp=SCCS$v855,
AP1=SCCS$v921,AP2=SCCS$v928,ndrymonth=SCCS$v196,
exogamy=SCCS$v72,ncmallow=SCCS$v227,famsize=SCCS$v80,
settype=SCCS$v234,localjh=(SCCS$v236-1),superjh=SCCS$v237,
moralgods=SCCS$v238,fempower=SCCS$v663,
sexratio=1+(SCCS$v1689>85)+(SCCS$v1689>115),
war=SCCS$v1648,himilexp=(SCCS$v899==1)*1,
money=SCCS$v155,wagelabor=SCCS$v1732,
migr=(SCCS$v677==2)*1,brideprice=(SCCS$v208==1)*1,
nuclearfam=(SCCS$v210<=3)*1######,pctFemPolyg=SCCS$v872##THIS IS NOW A DEPVAR not an indep var
,plunder=SCCS$v912,fratgrpstr=SCCS$v570, ####ADDED three variables####
climate<-(SCCS$v857==1 | SCCS$v857==2 | SCCS$v857==6)*1+(SCCS$v857==3 | #####ADDED CLIMATE
SCCS$v857==4)*2+(SCCS$v857==5)*3 #####ADDED CLIMATE
#####--Should double-check everything
,cbind(SCCS$v857,climate), #####ADDED CLIMATE
marrcaptives=SCCS$v870,smlpop=(SCCS$v1122<=3)*1####ADDED three variables####
)
#--look at first 6 rows of fx--
head(fx)
#--check to see number of missing values--
#--also check whether numeric--
vvn<-names(fx)
pp<-NULL
for (i in 1:length(vvn)){
nmiss<-length(which(is.na(fx[,vvn[i]])))
numeric<-is.numeric(fx[,vvn[i]])
pp<-rbind(pp,cbind(nmiss,data.frame(numeric)))
}
row.names(pp)<-vvn
pp
#--identify variables with missing values--
z<-which(pp[,1]>0)
zv1<-vvn[z]
zv1
#--identify variables with non-missing values--
z<-which(pp[,1]==0)
zv2<-vvn[z]
zv2
#-----------------------------
#----Multiple imputation------
#-----------------------------
#--number of imputed data sets to create--
nimp<-10
#--one at a time, loop through those variables with missing values--
for (i in 1:length(zv1)){
#--attach the imputand to the auxiliary data--
zxx<-data.frame(cbind(vaux,fx[,zv1[i]]))
#--in the following line, the imputation is done--
aqq<-complete(mice(zxx,maxit=100,m=nimp),action="long")
#--during first iteration of the loop, create dataframe impdat--
if (i==1){
impdat<-data.frame(aqq[,c(".id",".imp")])
}
#--the imputand is placed as a field in impdat and named--
impdat<-cbind(impdat,data.frame(aqq[,NCOL(zxx)]))
names(impdat)[NCOL(impdat)]<-zv1[i]
}
#--now the non-missing variables are attached to impdat--
gg<-NULL
for (i in 1:nimp){
gg<-rbind(gg,data.frame(fx[,zv2]))
}
impdat<-cbind(impdat,gg)
#--take a look at the top 6 and bottom 6 rows of impdat--
head(impdat)
tail(impdat)
#--impdat is saved as an R-format data file--
save(impdat,file="impdat.Rdata")
Program 2
#MI--estimate model with network-lagged dependent variables, combine results
rm(list=ls(all=TRUE))
#--Set path to your directory with data and program--
setwd("c:/My Documents/MI")
options(echo=TRUE)
#--need these packages for estimation and diagnostics--
library(foreign)
library(spdep)
library(car)
library(lmtest)
library(sandwich)
#-----------------------------
#--Read in data, rearrange----
#-----------------------------
#--Read in original SCCS data---
load("SCCS.Rdata",.GlobalEnv)
#--Read in two weight matrices--
ll<-as.matrix(read.dta("langwm.dta")[,-1])
dd<-as.matrix(read.dta("dist25wm.dta")[,c(-1,-2,-189)])
#--Read in the imputed dataset---
load("impdat.Rdata",.GlobalEnv)
#HERE YOU CHANGE HOW THE DEPENDENT VARIABLE IS COMPUTED FOR A NEW PROJECT
#--create dep.varb. you wish to use from SCCS data--
#--Here we sum variables measuring how much a society values children--
#--can replace "sum" with "max"
###depvar<-apply(SCCS[,c("v473","v474","v475","v476")],1,sum)
depvar<-SCCS$v872 ### CHANGED DEPENDENT VARIABLE TO "pctFemPolyg"
#--find obs. for which dep. varb. is non-missing--
zdv<-which(!is.na(depvar))
depvar<-depvar[zdv]
#HERE GIVE THE "NAME" OF THE DEPENDENT VARIABLE THAT IS COMPUTED
depvarname<-"pctFemPolyg" ###"child value" CHANGED DEPVAR TO "pctFemPolyg"
#--can add additional SCCS variable, but only if it has no missing values---
dateobs<-SCCS$v838
dateobs<-dateobs[zdv]
#--look at frequencies and quartiles for the dep. varb.--
summary(depvar)
table(depvar)
#--modify weight matrices---
#--set diagonal equal to zeros--
diag(ll)<-0
diag(dd)<-0
#--use only obs. where dep. varb. non-missing--
ll<-ll[zdv,zdv]
dd<-dd[zdv,zdv]
#--row standardize (rows sum to one)
ll<-ll/rowSums(ll)
dd<-dd/rowSums(dd)
#--make weight matrix object for later autocorrelation test--
wmatll<-mat2listw(as.matrix(ll))
wmatdd<-mat2listw(as.matrix(dd))
#MODIFY THESE STATEMENTS FOR A NEW PROJECT
indpv<-c("femsubs","foodscarc","exogamy","ncmallow","superjh","moralgods",
"fempower","sexratio","war","himilexp","wagelabor","famsize","settype",
"localjh","money","cultints","roots","cereals","gath","hunt","fish",
"anim","pigs","milk","plow","bovines","tree","foodtrade",
"ndrymonth","ecorich","popdens","pathstress","CVrain","rain",
"fratgrpstr", "marrcaptives", "smlpop",####ADDED three variables
"temp","AP1","AP2","migr","brideprice","nuclearfam")######,"pctFemPolyg")
#-----------------------------------------------------
#---Estimate model on each imputed dataset------------
#-----------------------------------------------------
#--number of imputed datasets--
nimp<-10
#--will append values to these empty objects--
vif<-NULL
ss<-NULL
beta<-NULL
dng<-NULL
#--loop through the imputed datasets--
for (i in 1:nimp){
#--select the ith imputed dataset--
m9<-impdat[which(impdat$.imp==i),]
#--retain only obs. for which dep. varb. is nonmissing--
m9<-m9[zdv,]
#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--create spatially lagged dep. varbs. in stage 1 OLS--
y<-as.matrix(depvar)
xx<-as.matrix(m9[,indpv])
#--for instruments we use the spatial lag of our indep. varbs.--
#--First, the spatially lagged varb. for distance--
xdy<-dd%*%xx
cyd<-dd%*%y
o<-lm(cyd~xdy)
#--the fitted value is our instrumental variable--
fydd<-fitted(o)
#--keep R2 from this regression--
dr2<-summary(o)$r.squared
#--Then, the spatially lagged varb. for language--
xly<-ll%*%xx
cyl<-ll%*%y
o<-lm(cyl~xly)
#--the fitted value is our instrumental variable--
fyll<-fitted(o)
#--keep R2 from this regression--
lr2<-summary(o)$r.squared
m9<-cbind(m9,fydd,fyll)
#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--Stage 2 OLS estimate of unrestricted model--
xUR<-lm(depvar~fyll+fydd+dateobs+
cultints+roots+cereals+gath+plow+
hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
+ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
settype+localjh+superjh+moralgods+fempower+femsubs+
sexratio+war+himilexp+money+wagelabor+
migr+brideprice+nuclearfam######+pctFemPolyg
+climate+plunder+fratgrpstr+marrcaptives+smlpop####ADDED three variables PLUS CLIMATE
,data=m9)
#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--Stage 2 OLS estimate of restricted model--
###xR<-lm(depvar ~ fyll + cultints + roots + fish +
### exogamy + settype + femsubs, data = m9)
###REDEFINED xR in the next command: Only those significant
xR<-lm(depvar~fyll+fydd+###dateobs+ ###Take out this one variables
plow+anim+milk+foodtrade
+ecorich+fish####ADDED from existing independent variables
+climate+plunder+fratgrpstr+marrcaptives+smlpop####ADDED three variables PLUS CLIMATE
####+FratIntPres+plunder####newly defined v569, 912###ALSO newly added to INDEP VARS
##cultints+roots+cereals+gath+plow+
##hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
##+ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
##settype+localjh+superjh+moralgods+fempower+femsubs+
##sexratio+war+himilexp+money+wagelabor+
##migr+brideprice+nuclearfam######+pctFemPolyg
,data=m9)
#--corrected sigma2 and R2 for 2SLS--
qxx<-m9
qxx[,"fydd"]<-cyd
qxx[,"fyll"]<-cyl
b<-coef(xR)
incpt<-matrix(1,NROW(qxx),1)
x<-as.matrix(cbind(incpt,qxx[,names(b)[-1]]))
e<-y-x%*%as.matrix(b)
cs2<-as.numeric(t(e)%*%e/(NROW(x)-NCOL(x)))
cr2<-as.numeric(1-t(e)%*%e/sum((y-mean(y))^2))
#--collect coefficients and their variances--
ov<-summary(xR)
vif<-rbind(vif,vif(xR))
ss<-rbind(ss,diag(ov$cov*cs2))
#--collect robust coef. variances when there is heteroskedasticity--
#eb<-e^2
#x<-as.matrix(cbind(incpt,m9[,names(b)[-1]]))
#hcm<-inv(t(x)%*%x)%*%t(x)%*%diag(eb[1:length(eb)])%*%x%*%inv(t(x)%*%x)
#ss<-rbind(ss,diag(hcm))
beta<-rbind(beta,coef(xR))
#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--collect some model diagnostics--
dropt<-c("cereals","gath","plow","hunt","anim","dateobs",
"pigs","milk","bovines","foodscarc","ecorich",
"popdens","pathstress","ncmallow","famsize","localjh",
"superjh","moralgods","fempower","sexratio","money",
"climate","plunder","fratgrpstr","marrcaptives","smlpop",####ADDED three variables PLUS CLIMATE
"fydd","wagelabor","war","himilexp","tree","foodtrade")
#--Ramsey RESET test--
p1<-qchisq(resettest(xR,type="fitted")$"p.value",1,lower.tail=FALSE)
#--Wald test (H0: dropped variables have coefficient equal zero)--
o<-linear.hypothesis(xUR,dropt,test="Chisq")$"Pr(>Chisq)"[2]
p2<-qchisq(o,1,lower.tail=FALSE) #find Chisq with 1 d.f. and same pvalue
#--Heteroskedasticity test (H0: homoskedastic residuals)--
p3<-ncv.test(xR)$ChiSquare
#--Shapiro-Wilke normality test (H0: residuals normal)
p4<-qchisq(shapiro.test(e)$p.value,1,lower.tail=FALSE)
#--LaGrange Multiplier test for spatial autocorrelation: language--
o<-lm.LMtests(xR, wmatll, test=c("LMlag"))
p5<-as.numeric(o$LMlag$statistic)
#--LaGrange Multiplier test for spatial autocorrelation: distance--
o<-lm.LMtests(xR, wmatdd, test=c("LMlag"))
p6<-as.numeric(o$LMlag$statistic)
#--model R2--
p7<-cr2
dng<-rbind(dng,cbind(p1,p2,p3,p4,p5,p6,p7,dr2,lr2))
}
#--------------------------------------------
#--Rubin's formulas for combining estimates--
#--------------------------------------------
#--first find final regr. coefs. and p-values--
mnb<-apply(beta,2,mean)
vrb<-colSums((beta-t(matrix(mnb,length(mnb),10)))^2)/(nimp-1)
mnv<-apply(ss,2,mean)
vrT<-mnv+vrb*(1-nimp^(-1))
fst<-mnb^2/vrT
r<-(1+nimp^(-1))*vrb/mnv
v<-(nimp-1)*(1+r^(-1))^2
pval<-pf(fst,1,v,lower.tail=FALSE)
bbb<-data.frame(round(cbind(mnb,fst,v,pval),3))
bbb$VIF[2:NROW(bbb)]<-round(apply(vif,2,mean),3)
names(bbb)<-c("coef","Fstat","ddf","pvalue","VIF")
#--Then combine the diagnostics we collected--
dng<-data.frame(dng)
names(dng)<-c("RESET","Wald on restrs.","NCV","SWnormal","lagll","lagdd",
"R2:final model","R2:IV(distance)","R2:IV(language)")
r2<-apply(dng[,7:9],2,mean)
adng<-dng[,1:6]
mdm<-apply(adng,2,mean)
vrd<-colSums((adng-t(matrix(mdm,length(mdm),nimp)))^2)/(nimp-1)
aa<-4*mdm^2-2*vrd
aa[which(aa<0)]<-0
rd<-(1+nimp^(-1))*vrd/(2*mdm+aa^.5)
vd<-(nimp-1)*(1+rd^(-1))^2
Dm<-(mdm-(nimp-1)/(nimp+1)*rd)/(1+rd)
#-All chi-sq we collected have df=1-------
pvald<-pf(Dm,1,vd,lower.tail=FALSE)
ccc<-data.frame(round(cbind(Dm,vd,pvald),3))
names(ccc)<-c("Fstat","df","pvalue")
bbb
r2
ccc
#Corrected to publication version with depvarname
#--write results to csv file for perusal in spreadsheet--
write.csv(paste("2SLS model for ",depvarname,sep=""),file="OLSresults.csv", append=FALSE)
write.csv(bbb,file="OLSresults.csv",append=TRUE)
write.csv(r2,file="OLSresults.csv",append=TRUE)
write.csv(ccc,file="OLSresults.csv",append=TRUE)
> bbb
coef Fstat ddf pvalue VIF
(Intercept) -6.685 0.182 2907.569 0.670 NA fyll 0.593 1.600 36952.826 0.206 1.954 fydd 0.412 3.126 7220.881 0.077 2.428 plow -6.913 0.850 3400.677 0.357 2.218 anim 0.193 0.019 26798.345 0.891 2.590 milk -3.166 0.202 4752.296 0.653 2.846 foodtrade -0.017 0.008 4361.707 0.931 1.225 ecorich -4.516 2.340 2367.661 0.126 3.658 fish -4.706 14.131 21205.243 0.000 1.477 climate 9.942 3.989 863.506 0.046 3.289 plunder -6.783 2.167 879.830 0.141 1.330 fratgrpstr 4.589 4.563 171.737 0.034 1.926 marrcaptives 3.677 2.175 504.869 0.141 1.431 smlpop 10.155 3.777 1812.562 0.052 1.839 > r2
R2:final model R2:IV(distance) R2:IV(language)
0.4048324 0.9858379 0.9829404
> ccc
Fstat df pvalue
RESET 2.687 1956.390 0.101 Wald on restrs. 8.264 363.213 0.004 NCV 15.146 2301.375 0.000 SWnormal 9.321 7744.546 0.002 lagll 0.091 1237474.986 0.763 lagdd 1.145 57685.536 0.285
[edit] B| Results for pctFemPolyg R2=.40
> bbb
1 2SLS model for pctFemPolyg (polygyny depvar)
coef Fstat ddf pvalue VIF
(Intercept) -6.400 0.169 4569.307 0.681 NA
fyll 0.601 1.602 6442.466 0.206 1.959
fydd 0.416 3.157 12501.373 0.076 2.455
plow -6.984 0.877 28045.989 0.349 2.248
anim 0.237 0.028 11074.920 0.867 2.588
milk -3.291 0.217 6628.079 0.642 2.869
foodtrade -0.020 0.011 16451.304 0.916 1.221
ecorich -4.556 2.339 1830.552 0.126 3.679
fish -4.817 15.005 31693.269 0.000 1.455
climate 9.959 4.083 1606.577 0.043 3.290
plunder -7.068 2.459 1951.552 0.117 1.304
fratgrpstr 4.529 4.344 258.370 0.038 2.002
marrcaptives 3.716 2.456 15590.921 0.117 1.415
smlpop 10.489 3.967 2714.303 0.047 1.882
> r2
R2:final model R2:IV(distance) R2:IV(language)
0.4022099 0.9859886 0.9828142
> ccc
Fstat df pvalue
RESET 2.606 4155.811 0.107
Wald on restrs. 7.788 677.723 0.005
NCV 13.898 1150.388 0.000
SWnormal 9.893 4590.932 0.002
lagll 0.089 4037335.687 0.765
lagdd 1.168 1142869.805 0.280
>
[edit] A| substitute self-amplifying=SCCS$v876 for pctFemPolyg=v872 R2=.39
######(dropped in 4 places from independent variables)
Program 1 and 2 from Eff and Dow 2009
#MI--make the imputed datasets
#--change the following path to the directory with your data and program--
setwd("c:/My Documents/MI")
rm(list=ls(all=TRUE))
options(echo=TRUE)
#--you need the following two packages--you must install them first--
library(foreign)
library(mice)
#--To find the citation for a package, use this function:---
citation("mice")
#-----------------------------
#--Read in data, rearrange----
#-----------------------------
#--Read in auxiliary variables---
load("vaux.Rdata",.GlobalEnv)
row.names(vaux)<-NULL
#--Read in the SCCS dataset---
load("SCCS.Rdata",.GlobalEnv)
#--look at first 6 rows of vaux--
head(vaux)
#--look at field names of vaux--
names(vaux)
#--check to see that rows are properly aligned in the two datasets--
#--sum should equal 186---
sum((SCCS$socname==vaux$socname)*1)
#--remove the society name field--
vaux<-vaux[,-28]
names(vaux)
#--Two nominal variables: brg and rlg----
#--brg: consolidated Burton Regions-----
#0 = (rest of world) circumpolar, South and Meso-America, west North America
#1 = Subsaharan Africa
#2 = Middle Old World
#3 = Southeast Asia, Insular Pacific, Sahul
#4 = Eastern Americas
#--rlg: Religion---
#'0 (no world religion)'
#'1 (Christianity)'
#'2 (Islam)'
#'3 (Hindu/Buddhist)'
#--check to see number of missing values in vaux,
#--whether variables are numeric,
#--and number of discrete values for each variable---
vvn<-names(vaux)
pp<-NULL
for (i in 1:length(vvn)){
nmiss<-length(which(is.na(vaux[,vvn[i]])))
numeric<-is.numeric(vaux[,vvn[i]])
numDiscrVals<-length(table(vaux[,vvn[i]]))
pp<-rbind(pp,cbind(data.frame(numeric),nmiss,numDiscrVals))
}
row.names(pp)<-vvn
pp
#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--extract variables to be used from SCCS, put in dataframe fx--
fx<-data.frame(
socname=SCCS$socname,socID=SCCS$"sccs#",
valchild=(SCCS$v473+SCCS$v474+SCCS$v475+SCCS$v476),
cultints=SCCS$v232,roots=(SCCS$v233==5)*1,
cereals=(SCCS$v233==6)*1,gath=SCCS$v203,hunt=SCCS$v204,
fish=SCCS$v205,anim=SCCS$v206,femsubs=SCCS$v890,
pigs=(SCCS$v244==2)*1,milk=(SCCS$v245>1)*1,plow=(SCCS$v243>1)*1,
bovines=(SCCS$v244==7)*1,tree=(SCCS$v233==4)*1,
foodtrade=SCCS$v819,foodscarc=SCCS$v1685,
ecorich=SCCS$v857,popdens=SCCS$v156,pathstress=SCCS$v1260,
CVrain=SCCS$v1914/SCCS$v1913,rain=SCCS$v854,temp=SCCS$v855,
AP1=SCCS$v921,AP2=SCCS$v928,ndrymonth=SCCS$v196,
exogamy=SCCS$v72,ncmallow=SCCS$v227,famsize=SCCS$v80,
settype=SCCS$v234,localjh=(SCCS$v236-1),superjh=SCCS$v237,
moralgods=SCCS$v238,fempower=SCCS$v663,
sexratio=1+(SCCS$v1689>85)+(SCCS$v1689>115),
war=SCCS$v1648,himilexp=(SCCS$v899==1)*1,
money=SCCS$v155,wagelabor=SCCS$v1732,
migr=(SCCS$v677==2)*1,brideprice=(SCCS$v208==1)*1,
nuclearfam=(SCCS$v210<=3)*1
,plunder=SCCS$v912,fratgrpstr=SCCS$v570, ####ADDED three variables####
climate<-(SCCS$v857==1 | SCCS$v857==2 | SCCS$v857==6)*1+(SCCS$v857==3 | #####ADDED CLIMATE
SCCS$v857==4)*2+(SCCS$v857==5)*3 #####ADDED CLIMATE
#####--Should double-check everything
,cbind(SCCS$v857,climate), #####ADDED CLIMATE
marrcaptives=SCCS$v870,smlpop=(SCCS$v1122<=3)*1####ADDED three variables####
)
#--look at first 6 rows of fx--
head(fx)
#--check to see number of missing values--
#--also check whether numeric--
vvn<-names(fx)
pp<-NULL
for (i in 1:length(vvn)){
nmiss<-length(which(is.na(fx[,vvn[i]])))
numeric<-is.numeric(fx[,vvn[i]])
pp<-rbind(pp,cbind(nmiss,data.frame(numeric)))
}
row.names(pp)<-vvn
pp
#--identify variables with missing values--
z<-which(pp[,1]>0)
zv1<-vvn[z]
zv1
#--identify variables with non-missing values--
z<-which(pp[,1]==0)
zv2<-vvn[z]
zv2
#-----------------------------
#----Multiple imputation------
#-----------------------------
#--number of imputed data sets to create--
nimp<-10
#--one at a time, loop through those variables with missing values--
for (i in 1:length(zv1)){
#--attach the imputand to the auxiliary data--
zxx<-data.frame(cbind(vaux,fx[,zv1[i]]))
#--in the following line, the imputation is done--
aqq<-complete(mice(zxx,maxit=100,m=nimp),action="long")
#--during first iteration of the loop, create dataframe impdat--
if (i==1){
impdat<-data.frame(aqq[,c(".id",".imp")])
}
#--the imputand is placed as a field in impdat and named--
impdat<-cbind(impdat,data.frame(aqq[,NCOL(zxx)]))
names(impdat)[NCOL(impdat)]<-zv1[i]
}
#--now the non-missing variables are attached to impdat--
gg<-NULL
for (i in 1:nimp){
gg<-rbind(gg,data.frame(fx[,zv2]))
}
impdat<-cbind(impdat,gg)
#--take a look at the top 6 and bottom 6 rows of impdat--
head(impdat)
tail(impdat)
#--impdat is saved as an R-format data file--
save(impdat,file="impdat.Rdata")
Program 2
#MI--estimate model with network-lagged dependent variables, combine results
rm(list=ls(all=TRUE))
#--Set path to your directory with data and program--
setwd("c:/My Documents/MI")
options(echo=TRUE)
#--need these packages for estimation and diagnostics--
library(foreign)
library(spdep)
library(car)
library(lmtest)
library(sandwich)
#-----------------------------
#--Read in data, rearrange----
#-----------------------------
#--Read in original SCCS data---
load("SCCS.Rdata",.GlobalEnv)
#--Read in two weight matrices--
ll<-as.matrix(read.dta("langwm.dta")[,-1])
dd<-as.matrix(read.dta("dist25wm.dta")[,c(-1,-2,-189)])
#--Read in the imputed dataset---
load("impdat.Rdata",.GlobalEnv)
#HERE YOU CHANGE HOW THE DEPENDENT VARIABLE IS COMPUTED FOR A NEW PROJECT
#--create dep.varb. you wish to use from SCCS data--
#--Here we sum variables measuring how much a society values children--
#--can replace "sum" with "max"
###depvar<-apply(SCCS[,c("v473","v474","v475","v476")],1,sum)
depvar<-SCCS$v876 ### depvar CHANGED TO "self-amplifying"
#--find obs. for which dep. varb. is non-missing--
zdv<-which(!is.na(depvar))
depvar<-depvar[zdv]
#HERE GIVE THE "NAME" OF THE DEPENDENT VARIABLE THAT IS COMPUTED
depvarname<-"self-amplifying" ### depvarname CHANGED DEPVAR TO "self-amplifying" v876
#--can add additional SCCS variable, but only if it has no missing values---
dateobs<-SCCS$v838
dateobs<-dateobs[zdv]
#--look at frequencies and quartiles for the dep. varb.--
summary(depvar)
table(depvar)
#--modify weight matrices---
#--set diagonal equal to zeros--
diag(ll)<-0
diag(dd)<-0
#--use only obs. where dep. varb. non-missing--
ll<-ll[zdv,zdv]
dd<-dd[zdv,zdv]
#--row standardize (rows sum to one)
ll<-ll/rowSums(ll)
dd<-dd/rowSums(dd)
#--make weight matrix object for later autocorrelation test--
wmatll<-mat2listw(as.matrix(ll))
wmatdd<-mat2listw(as.matrix(dd))
#MODIFY THESE STATEMENTS FOR A NEW PROJECT
indpv<-c("femsubs","foodscarc","exogamy","ncmallow","superjh","moralgods",
"fempower","sexratio","war","himilexp","wagelabor","famsize","settype",
"localjh","money","cultints","roots","cereals","gath","hunt","fish",
"anim","pigs","milk","plow","bovines","tree","foodtrade",
"ndrymonth","ecorich","popdens","pathstress","CVrain","rain",
"fratgrpstr", "marrcaptives", "smlpop",####ADDED three variables
"temp","AP1","AP2","migr","brideprice","nuclearfam")######,"pctFemPolyg")
#-----------------------------------------------------
#---Estimate model on each imputed dataset------------
#-----------------------------------------------------
#--number of imputed datasets--
nimp<-10
#--will append values to these empty objects--
vif<-NULL
ss<-NULL
beta<-NULL
dng<-NULL
#--loop through the imputed datasets--
for (i in 1:nimp){
#--select the ith imputed dataset--
m9<-impdat[which(impdat$.imp==i),]
#--retain only obs. for which dep. varb. is nonmissing--
m9<-m9[zdv,]
#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--create spatially lagged dep. varbs. in stage 1 OLS--
y<-as.matrix(depvar)
xx<-as.matrix(m9[,indpv])
#--for instruments we use the spatial lag of our indep. varbs.--
#--First, the spatially lagged varb. for distance--
xdy<-dd%*%xx
cyd<-dd%*%y
o<-lm(cyd~xdy)
#--the fitted value is our instrumental variable--
fydd<-fitted(o)
#--keep R2 from this regression--
dr2<-summary(o)$r.squared
#--Then, the spatially lagged varb. for language--
xly<-ll%*%xx
cyl<-ll%*%y
o<-lm(cyl~xly)
#--the fitted value is our instrumental variable--
fyll<-fitted(o)
#--keep R2 from this regression--
lr2<-summary(o)$r.squared
m9<-cbind(m9,fydd,fyll)
#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--Stage 2 OLS estimate of unrestricted model--
xUR<-lm(depvar~fyll+fydd+dateobs+
cultints+roots+cereals+gath+plow+
hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
+ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
settype+localjh+superjh+moralgods+fempower+femsubs+
sexratio+war+himilexp+money+wagelabor+
migr+brideprice+nuclearfam
+climate+plunder+fratgrpstr+marrcaptives+smlpop####ADDED three variables PLUS CLIMATE
,data=m9)
#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--Stage 2 OLS estimate of restricted model--
###xR<-lm(depvar ~ fyll + cultints + roots + fish +
### exogamy + settype + femsubs, data = m9)
###REDEFINED xR in the next command: Only those significant
xR<-lm(depvar~fyll+fydd+###dateobs+ ###Take out this one variables
plow+anim+milk+foodtrade
+ecorich+fish####ADDED from existing independent variables
+climate+plunder+fratgrpstr+marrcaptives+smlpop####ADDED three variables PLUS CLIMATE
####+FratIntPres+plunder####newly defined v569, 912###ALSO newly added to INDEP VARS
##cultints+roots+cereals+gath+plow+
##hunt+fish+anim+pigs+milk+bovines+tree+foodtrade+foodscarc+
##+ecorich+popdens+pathstress+exogamy+ncmallow+famsize+
##settype+localjh+superjh+moralgods+fempower+femsubs+
##sexratio+war+himilexp+money+wagelabor+
##migr+brideprice+nuclearfam
,data=m9)
#--corrected sigma2 and R2 for 2SLS--
qxx<-m9
qxx[,"fydd"]<-cyd
qxx[,"fyll"]<-cyl
b<-coef(xR)
incpt<-matrix(1,NROW(qxx),1)
x<-as.matrix(cbind(incpt,qxx[,names(b)[-1]]))
e<-y-x%*%as.matrix(b)
cs2<-as.numeric(t(e)%*%e/(NROW(x)-NCOL(x)))
cr2<-as.numeric(1-t(e)%*%e/sum((y-mean(y))^2))
#--collect coefficients and their variances--
ov<-summary(xR)
vif<-rbind(vif,vif(xR))
ss<-rbind(ss,diag(ov$cov*cs2))
#--collect robust coef. variances when there is heteroskedasticity--
#eb<-e^2
#x<-as.matrix(cbind(incpt,m9[,names(b)[-1]]))
#hcm<-inv(t(x)%*%x)%*%t(x)%*%diag(eb[1:length(eb)])%*%x%*%inv(t(x)%*%x)
#ss<-rbind(ss,diag(hcm))
beta<-rbind(beta,coef(xR))
#MODIFY THESE STATEMENTS FOR A NEW PROJECT
#--collect some model diagnostics--
dropt<-c("cereals","gath","plow","hunt","anim","dateobs",
"pigs","milk","bovines","foodscarc","ecorich",
"popdens","pathstress","ncmallow","famsize","localjh",
"superjh","moralgods","fempower","sexratio","money",
"climate","plunder","fratgrpstr","marrcaptives","smlpop",####ADDED three variables PLUS CLIMATE
"fydd","wagelabor","war","himilexp","tree","foodtrade")
#--Ramsey RESET test--
p1<-qchisq(resettest(xR,type="fitted")$"p.value",1,lower.tail=FALSE)
#--Wald test (H0: dropped variables have coefficient equal zero)--
o<-linear.hypothesis(xUR,dropt,test="Chisq")$"Pr(>Chisq)"[2]
p2<-qchisq(o,1,lower.tail=FALSE) #find Chisq with 1 d.f. and same pvalue
#--Heteroskedasticity test (H0: homoskedastic residuals)--
p3<-ncv.test(xR)$ChiSquare
#--Shapiro-Wilke normality test (H0: residuals normal)
p4<-qchisq(shapiro.test(e)$p.value,1,lower.tail=FALSE)
#--LaGrange Multiplier test for spatial autocorrelation: language--
o<-lm.LMtests(xR, wmatll, test=c("LMlag"))
p5<-as.numeric(o$LMlag$statistic)
#--LaGrange Multiplier test for spatial autocorrelation: distance--
o<-lm.LMtests(xR, wmatdd, test=c("LMlag"))
p6<-as.numeric(o$LMlag$statistic)
#--model R2--
p7<-cr2
dng<-rbind(dng,cbind(p1,p2,p3,p4,p5,p6,p7,dr2,lr2))
}
#--------------------------------------------
#--Rubin's formulas for combining estimates--
#--------------------------------------------
#--first find final regr. coefs. and p-values--
mnb<-apply(beta,2,mean)
vrb<-colSums((beta-t(matrix(mnb,length(mnb),10)))^2)/(nimp-1)
mnv<-apply(ss,2,mean)
vrT<-mnv+vrb*(1-nimp^(-1))
fst<-mnb^2/vrT
r<-(1+nimp^(-1))*vrb/mnv
v<-(nimp-1)*(1+r^(-1))^2
pval<-pf(fst,1,v,lower.tail=FALSE)
bbb<-data.frame(round(cbind(mnb,fst,v,pval),3))
bbb$VIF[2:NROW(bbb)]<-round(apply(vif,2,mean),3)
names(bbb)<-c("coef","Fstat","ddf","pvalue","VIF")
#--Then combine the diagnostics we collected--
dng<-data.frame(dng)
names(dng)<-c("RESET","Wald on restrs.","NCV","SWnormal","lagll","lagdd",
"R2:final model","R2:IV(distance)","R2:IV(language)")
r2<-apply(dng[,7:9],2,mean)
adng<-dng[,1:6]
mdm<-apply(adng,2,mean)
vrd<-colSums((adng-t(matrix(mdm,length(mdm),nimp)))^2)/(nimp-1)
aa<-4*mdm^2-2*vrd
aa[which(aa<0)]<-0
rd<-(1+nimp^(-1))*vrd/(2*mdm+aa^.5)
vd<-(nimp-1)*(1+rd^(-1))^2
Dm<-(mdm-(nimp-1)/(nimp+1)*rd)/(1+rd)
#-All chi-sq we collected have df=1-------
pvald<-pf(Dm,1,vd,lower.tail=FALSE)
ccc<-data.frame(round(cbind(Dm,vd,pvald),3))
names(ccc)<-c("Fstat","df","pvalue")
bbb
r2
ccc
#Corrected to publication version with depvarname
#--write results to csv file for perusal in spreadsheet--
write.csv(paste("2SLS model for ",depvarname,sep=""),file="OLSresults.csv", append=FALSE)
write.csv(bbb,file="OLSresults.csv",append=TRUE)
write.csv(r2,file="OLSresults.csv",append=TRUE)
write.csv(ccc,file="OLSresults.csv",append=TRUE)
[edit] Results for self-amplifying polygyny v876 R2=.39 (poor VIF)
876. Polygyny Distributions
129 . = Missing data <-- lots of missing data
27 0 = True Binomial
30 1 = Negative Binomial
bbb
1 2SLS model for self-amplifying (polygyny)
coef Fstat ddf pvalue VIF
(Intercept) 1.062 1.113 23647.208 0.291 NA
fyll 0.638 0.877 3416.281 0.349 2.421 <-- not significant
fydd -0.403 0.702 46094.103 0.402 3.117 <-- not significant
plow -0.549 2.532 14617.922 0.112 3.360 <-- not significant
anim -0.024 0.228 9004.785 0.633 3.359 <-- not significant
milk 0.171 0.284 13238.107 0.594 5.282 <-- not significant
foodtrade 0.007 0.565 186303.681 0.452 1.714 <-- not significant
ecorich 0.001 0.000 23599.146 0.993 7.382 <-- not significant
fish -0.050 0.563 28042.863 0.453 1.591 <-- not significant
climate -0.027 0.014 8505.959 0.905 7.823 <-- not significant
plunder -0.139 0.507 495.348 0.477 1.957 <-- not significant
fratgrpstr 0.041 0.228 174.068 0.634 2.854 <-- not significant
marrcaptives 0.155 2.271 934.925 0.132 2.294 <-- not significant
smlpop -0.038 0.029 11313.618 0.864 2.905 <-- not significant
PROBLEM WITH R2 is too few cases (n=52). The VIF values are way way too high, e.g. ecorich, milk, anim, plow
> r2
R2:final model R2:IV(distance) R2:IV(language)
0.3921506 0.9946715 0.9955094
> ccc
Fstat df pvalue
RESET 0.131 2795.626 0.718
Wald on restrs. -0.322 10.328 1.000
NCV 0.080 2977.227 0.777
SWnormal 0.629 5214.671 0.428
lagll 0.025 951136.960 0.875
lagdd 0.009 812955.724 0.923
>
depvar
[1] 1 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 2 2 1 2 1 1 1 2 2 1 2 1 [39] 1 1 1 2 1 2 2 1 2 1 1 1 2
[edit] Crosstabs
setwd("c:/My Documents/MI")
#--Read in the SCCS dataset---
load("SCCS.Rdata",.GlobalEnv)
dbl_std_sex=SCCS$v596 hus_wife_deference=SCCS$v615 library(gmodels) tab=cbind(hus_wife_deference,dbl_std_sex) tabl<-na.omit(tab) #eliminate cases with missing data x=tabl[,1] #take variable for those cases y=tabl[,2] #take variable for those cases CrossTable(x,y,prop.r=FALSE, prop.c=FALSE, prop.t=FALSE, expected=TRUE)
Bela Nagy: Note that CrossTable will include NaN values which means not a number. The missing value is coded as NA in R. So changing all the NaN values to NA allows you to have missing values which CrossTable will not include. The quickest way to convert NaN to NA is this for variable x:
x[is.nan(x)]=NA dbl_std_sex[is.nan(dbl_std_sex)]=NA hus_wife_deference[is.nan(hus_wife_deference)]=NA dbl_std_sex hus_wife_deference CrossTable( dbl_std_sex,hus_wife_deference,prop.r=FALSE, prop.c=FALSE, prop.t=FALSE, expected=TRUE)
This gives a nice table for a paper.
