MW example

From InterSciWiki

Jump to: navigation, search
######################################################
####  Cohesive Blocking Algorithm and Utilities   ####
######################################################
####           Written by Peter McMahan           ####
######################################################
require(RBGL) ## OK to not have RBGL, but it speeds some stuff up
# Need RSQLite too
require(igraph, warn.conflicts = FALSE, quietly=TRUE)
require(digest)
######################################################
####          A note on parallelization           ####
######################################################
## Many of these functions have an optional argument##
## `cl`. If used, this should be a cluster object   ##
## from the package "snow". I used "Rmpi" to use    ##
## this functionality.                              ##
######################################################
######################################################
## Main cohesive blocking function. Feed it a graph ##
## of class igraph and it returns an object of      ##
## class bgraph. Definitely requires digest, and    ##
## will use RSQLite for large graphs if it is avai- ##
## lable.                                           ##
######################################################
cohesive.blocks <- function(G,db=NULL, useDB=NULL,cl=NULL){
   aigraph <- "package:igraph" %in% search()
   if(aigraph){
    silentwarnings <- capture.output(
     detach(package:igraph,unload=TRUE)
    )
   }
   anetwork <- "package:network" %in% search()
   if(!anetwork){
    silentwarnings <- capture.output(
     require(network, warn.conflicts = FALSE, quietly=TRUE)
    )
   }
   if(is.null(useDB)){
    useDB <- network.size(G)>400 & require(RSQLite)
   }
   if(useDB & !require(RSQLite)) stop("package `RSQLite` required")
   if(!require(digest)) stop("package `digest` required")
   if(!is.network(G)) stop("g must be an network object")
   
   G <- network2igraph(G)
   Gin <- G
   G <- simplify(as.undirected2(G))
   V(G)$cbid <- as.numeric(V(G))
   infCohesion <- vcount(G)+1 ## for fully connected subcomponents
   g <- G 
   v <- sort(as.numeric(V(g)))
   
   ## set up database connection and schema
   if(is.null(db) & useDB){
       ## initialize the database and define schema
       dbfile <- tempfile("cohesive.blocks",getwd())
       con <- dbConnect(dbDriver("SQLite"),dbname=dbfile)
       ## branchMembership
       dbSendQuery(con,"create table branchMembership (vertexId integer, branchId integer)")
       dbSendQuery(con,"create index index_membership on branchMembership (vertexId, branchId)")
       dbSendQuery(con,"create index index_vertex on branchMembership (vertexId)")
       dbSendQuery(con,"create index index_branch on branchMembership (branchId)")
       ## branches (unexamined branches given cohesion of (-1))
       dbSendQuery(con,"create table branches (branchId integer primary key, membershipHash text, branchCohesion integer default -1, maxAncestorCohesion integer, isCohesiveBlock integer default 0)")
       dbSendQuery(con,"create index index_hash on branches (membershipHash)")
       ## subBranches
       dbSendQuery(con,"create table subBranches (parentId integer, childId integer)")
       dbSendQuery(con,"create index index_parentId on subBranches (parentId)")
       
       ##
       ## Add the root branch to the whole graph
       ##
       ## create the hash
       v <- sort(as.numeric(V(g)))
       thisHash <- digest(paste(paste(v,collapse=" "),0))
       
       
       dbBeginTransaction(con)
       dbSendQuery(con,paste("insert into branches (branchId,membershipHash,maxAncestorCohesion,isCohesiveBlock) values(",0,",'",thisHash,"',",0,",",1,"); ",sep="")) 
       for(thisV in v){
           dbSendQuery(con,paste("insert into branchMembership values(",thisV,",",0,"); ",sep=""))
       }
       dbCommit(con)
       maxId <- 0
       
   }else if(useDB){
       ## connect to existing db
       dbfile <- db
       con <- dbConnect(dbDriver("SQLite"),dbname=dbfile)
       maxId <- dbGetQuery(con,"select max(branchId) from branches")
   }else{
       branchMembership <- data.frame(vertexId=sort(as.numeric(V(G))),branchId=0)
       thisHash <- digest(paste(paste(v,collapse=" "),0))
       branches <- data.frame(branchId=0,membershipHash=thisHash,branchCohesion=-1,maxAncestorCohesion=0,isCohesiveBlock=1)
       subBranches <- data.frame(parentId=NA,childId=NA)
   }
   
   
   ## now that setup's done, start looping
   notDone <- T
   while(notDone){
       ## which branch are we working on this time
       ## DATA ACCESS:
       if(useDB){
           branchId <- as.numeric(dbGetQuery(con,"select branchId from branches where branchCohesion=-1 limit 1"))
       }else{
           branchId <- branches$branchId[branches$branchCohesion==-1][1]
       }
       cat("Branch ",branchId,":  ",sep="")
       
       ## get data and make g
       ## DATA ACCESS:
       if(useDB){
           v <- sort(as.numeric(unlist(dbGetQuery(con,paste("select vertexId from branchMembership where branchId=",branchId,sep=""))1)))
           maxId <- as.numeric(dbGetQuery(con,"select max(branchId) from branches"))
       }else{
           v <- sort(branchMembership$vertexId[branchMembership$branchId==branchId])
           maxId <- max(branches$branchId)
       }
       
       g <- subgraph(G,V(G)[cbid %in% v])
       ## check if trivial or fully connected, else treat normally
       if(vcount(g) < 3){ #trivial
           cat("Branch single or binary -- no cohesive subgroups.\n")
           k <- 0
           if(useDB){
               dbSendQuery(con,paste("update branches set branchCohesion=",k," where branchId=",branchId,";",sep=""))
               notDone <- dbGetQuery(con,"select count(*) from branches where branchCohesion=-1")>0
               #dbSendQuery(con,paste("update branches set branchCohesion=0 where branchId=",branchId,";",sep=""))
           }else{
               branches$branchCohesion[branches$branchId==branchId] <- k
               notDone <- any(branches$branchCohesion==-1)
           }
       }else if(all(degree(g)>=vcount(g))){ #fully  connected
           cat("Branch fully connected -- setting cohesion to ",infCohesion,".\n",sep="")
           k <- infCohesion
           if(useDB){
               dbSendQuery(con,paste("update branches set branchCohesion=",k," where branchId=",branchId,";",sep=""))
               notDone <- dbGetQuery(con,"select count(*) from branches where branchCohesion=-1")>0
               #dbSendQuery(con,paste("update branches set branchCohesion=",infCohesion,", isCohesiveBlock=1 where branchId=",branchId,";",sep=""))
           }else{
               branches$branchCohesion[branches$branchId==branchId] <- k
               notDone <- any(branches$branchCohesion==-1)
           }
       }else{ #keep looking
           ## find cohesion of g
           if(!is.connected(g)){
               k <- 0
           }else if(min(degree(g))==1){
               k <- 1
           }else{
               k <- graph.cohesion(g)
           }
           cat(k,"-cohesive; ",sep="")
           
           ## trim it down:
           if(useDB){
               mac <- max(k,unlist(dbGetQuery(con,paste("select maxAncestorCohesion from branches where branchId =",branchId,";"))))
           }else{
               mac <- max(k,unlist(branches$maxAncestorCohesion[branches$branchId==branchId]))
           }
           cat(length(v))
           while(any(degree(g)<=mac)){ ## trimming goes on here
               g <- subgraph(g,V(g)[degree(g)>mac])
           }
           cat("(",vcount(g),") vertices; ",sep="")
           
           ## find k-components
           ## For nastier graphs:
           ## check if cohesion after trimming is greater than k. If so, kcomp <- list(as.numeric(V(g)))
           if(vcount(g)>100 & k>1){ ## only try speedup for relatively complicated kComponents() calls
               if(!is.connected(g)){
                   newk <- 0
               }else if(min(degree(g))==1){
                   newk <- 1
               }else{
                   newk <- graph.cohesion(g)
               }
               if(newk>k){
                   kcomp <- list(as.numeric(V(g)))
               }else{
                   kcomp <- kComponents(g,k,cl)
               }
           }else{ ## otherwise just use kComponents()
               kcomp <- kComponents(g,k,cl)
           }
           
           kcomp <- lapply(kcomp,function(thisV){V(g)[thisV]$cbid})
           cat(length(kcomp)," sub-branches, ",sep="")
           ## which have been searched?
           theseHashes <- lapply(kcomp,function(x){digest(paste(paste(sort(unlist(x)),collapse=" "),mac))})
           
           
           
           
           ## DATA ACCESS:
           if(useDB){
               searched <- dbGetQuery(con,paste("select branchId, membershipHash from branches where membershipHash in ('",paste(theseHashes,collapse="','"),"');",sep=""))
           }else{
               searched <- branches[c("branchId","membershipHash")][branches$membershipHash %in% theseHashes,]
           }
           searchedCount <- nrow(searched)
           
           ## assign appropriate branchIds
           branchIds <- as.numeric(rep(NA,length(kcomp)))
           isNew <- rep(T,length(kcomp))
           if(nrow(searched)>0){
               branchIds[ind <- match(searched$membershipHash,theseHashes)] <- searched$branchId
               branchIds[-ind] <- (maxId+1):(maxId+length(branchIds)-length(ind))
               isNew[ind] <- F
           }else if(length(branchIds)>0){
               branchIds <- (maxId+1):(maxId+length(branchIds))
           }
           
           ## DATA ACCESS:
           ## add all branches to `subBranches` and only new branches to `branches` and `branchMembership`
           if(useDB){
               dbBeginTransaction(con)
               if(length(kcomp)>0){for(i in 1:length(kcomp)){
                   dbSendQuery(con,paste("insert into subBranches values(",branchId,",",branchIds[i],");",sep=""))
                   if(isNew[i]){ ## if new insert into branches branchMembership
                       dbSendQuery(con,paste("insert into branches (branchId,membershipHash,maxAncestorCohesion) values(",branchIds[i],",'",theseHashes[i],"',",mac,");",sep=""))
                       for(thisV in sort(unlist(kcompi))){
                           dbSendQuery(con,paste("insert into branchMembership values(",thisV,",",branchIds[i],"); ",sep=""))
                       }
                   }
               }}
               dbSendQuery(con,paste("update branches set branchCohesion=",k," where branchId=",branchId,";",sep=""))
               dbCommit(con)
               notDone <- dbGetQuery(con,"select count(*) from branches where branchCohesion=-1")>0
           }else{
               if(length(kcomp)>0){
                   subBranches <- rbind(subBranches,data.frame(parentId=rep(branchId,length(branchIds)),childId=branchIds))
                   branches <- rbind(branches,data.frame(branchId=branchIds,membershipHash=unlist(theseHashes),branchCohesion=-1,maxAncestorCohesion=mac,isCohesiveBlock=0)[isNew,])
                   branchMembership <- rbind(branchMembership,data.frame(vertexId=unlist(kcomp[isNew]),branchId=rep(branchIds[isNew],lapply(kcomp[isNew],length))))
               }
               
               branches$branchCohesion[branches$branchId==branchId] <- k
               notDone <- any(branches$branchCohesion==-1)
           }
           
           cat("(",searchedCount," complete);\n",sep="")
       }
   }
   
   ## mark the cohesive blocks
   if(useDB){
       dbSendQuery(con,"update branches set isCohesiveBlock=1 where maxAncestorCohesion < branchCohesion")
   }else{
       branches$isCohesiveBlock[branches$maxAncestorCohesion < branches$branchCohesion] <- 1
   }
   
   ## and extract the data we need
   if(useDB){
       cBlocks <- dbGetQuery(con,"select * from branches where isCohesiveBlock>0")
       cbMembership <- dbGetQuery(con,paste("select * from branchMembership where branchId in (",paste(cBlocks$branchId,collapse=","),");",sep=""))
       subBranches <- dbReadTable(con,"subBranches")
       lapply(dbListResults(con),dbClearResult) ## just-in-case cleanup
       dbDisconnect(con)
   }else{
       cBlocks <- branches[branches$isCohesiveBlock>0,]
       cbMembership <- branchMembership[branchMembership$branchId %in% cBlocks$branchId,]
       subBranches <- subBranches[!(is.na(subBranches$parentId) | is.na(subBranches$childId)),]
   }
   
   
   ## get block members and cohesion
   blocks <- list()
   block.cohesion <- numeric()
   for(i in cBlocks$branchId){
       blocks <- c(blocks,list(cbMembership$vertexId[cbMembership$branchId==i]))
       block.cohesion <- c(block.cohesion,cBlocks$branchCohesion[cBlocks$branchId==i])
   }
   
   ## build the block hierarchy graph. first the vertices and attributes:
   tree <- graph.empty(nrow(cBlocks))
   V(tree)$branchId <- cBlocks$branchId
   V(tree)$nodes <- blocks
   V(tree)$chnodes <- lapply(blocks,paste,collapse=",")
   V(tree)$cohesion <- block.cohesion
   ## and now the edges:
   for(v in V(tree)){
       p <- find.cohesive.parents(V(tree)[v]$branchId,cBlocks$branchId,subBranches)
       e <- rep(v,times=2*length(p))
       e[(1:length(p))*2-1] <- V(tree)[match(p,V(tree)$branchId)-1]
       tree <- add.edges(tree,e)
   }
   
   ## put it all together and return
   if(useDB){
       res <- list(graph=Gin,tree=tree,blocks=blocks,block.cohesion=block.cohesion,data=dbfile)
   }else{
       res <- list(graph=Gin,tree=tree,blocks=blocks,block.cohesion=block.cohesion,data=list(branches=branches,branchMembership=branchMembership,subBranches=subBranches))
   }
   class(res) <- "bgraph"
   return(res)

}



    1. A small helper function to identify the cohesive ##
    2. parents of a given block inside cohesive.blocks. ##

find.cohesive.parents <- function(id,blockIds,subBranches){

   res <- numeric()
   Q <- subBranches$parentId[subBranches$childId==id]
   while(length(Q)>0){
       thisId <- Q[1]
       Q <- Q[-1]
       if(is.na(thisId)) stop("NA in subBranches. something's wrong")
       if(thisId %in% blockIds){
           res <- c(res,thisId)
       }else{
           Q <- c(Q,subBranches$parentId[subBranches$childId==thisId])
       }
   }
   return(unique(res))

}

######################################################
## kCutsets uses Kanevsy's algorithm to find all    ##
## cutsets of size k in the given graph.            ##
######################################################
kCutsets <- function(g,k=NULL,cl=NULL){
   if(!is.igraph(g)){
       stop("g must be an igraph object")
   }
   if(is.null(k)){
       if(!is.connected(g)){
           k <- 0
       }else if(min(degree(g))<=1){
           k <- 1
       }else{
           k <- vertex.connectivity(g)
       }
   }
   
   if(k==0 | vcount(g) <=2) return(list(numeric()))
   
   if(k==1){ # for 1-connected graphs, use RBGL's articulationPoints()
       return(articulation.points(g)) ## function defined below, converts to graphNEL and calls articulationPoints()
   }
   
   
   
   #############
   ## begin Kanevsky's (parallel) algorithm:
   #############
   v <- as.numeric(V(g))
   
   K <- v[order(degree(g),decreasing=T)][1:k] #2a
   if(is.cutset(K,g)) res <- c(res,list(K))
   
   P <- expand.grid(K,v) #2b
   
   G <- list(g) #3
   for(i in 2:nrow(P)){
       Gi <- add.edges(g,as.numeric(t(P[1:(i-1),])))
   }
   
   if(is.null(cl)){ 
       Gbar <- lapply(G,etReduction) #4 (sequential)
       mflow <- unlist(lapply(1:nrow(P),function(i){graph.maxflow(Gbari,P[i,1]+vcount(g),P[i,2])}))
       
       Gbar <- Gbar[mflow==k] #5 (sequential)
       P <- P[mflow==k,]
       
       if(nrow(P)<1) return(list())
       GbarRes <- lapply(1:nrow(P),function(i){graph.residual(Gbari,P[i,1]+vcount(g),P[i,2])}) #6 (sequential)
       comp <- lapply(GbarRes,clusters,mode="strong")
       L <- lapply(1:length(comp),function(i){graph.shrink(GbarResi,comp=compi)})
       
       L <- lapply(L,graph.antichains,cl=cl) #7 (sequential)
       
       ## convert antichains in L to vertex sets in g
       res <- lapply(1:length(L),
           function(i){
               gc()
               l <- Li
               cmp <- compi
               thisres <- list()
               for(cn in l){
                   thisres <- unique(c(thisres,list(unique(sort((which(cmp$membership %in% cn)-1) %% vcount(g))))))
               }
               return(thisres)
           }
       )
       res <- unlist(res,recursive=F)
       
       ## and see which are cutsets
       res <- unique(res[unlist(lapply(res,function(x){length(x)==k & is.cutset(x,g)}))])
   }else{
       Gbar <- parLapply(cl,G,etReduction) #4 (parallel)
       mflow <- unlist(parLapply(cl,1:nrow(P),function(i){graph.maxflow(Gbari,P[i,1]+vcount(g),P[i,2])}))
       
       Gbar <- Gbar[mflow==k] #5 (parallel)
       P <- P[mflow==k,]
       
       if(nrow(P)<1) return(list())
       GbarRes <- parLapply(cl,1:nrow(P),function(i){graph.residual(Gbari,P[i,1]+vcount(g),P[i,2])}) #6 (parallel)
       comp <- parLapply(cl,GbarRes,clusters,mode="strong")
       L <- parLapply(cl,1:length(comp),function(i){graph.shrink(GbarResi,comp=compi)})
       
       tryL <- try(parLapply(cl,L,graph.antichains,cl=NULL),silent=T) #7 (parallel)
       if(class(tryL)=="try-error"){ ## not sure why I need this, but Moody-White example was crashing with parLapply here
           warning(paste("Parallel execution on MPI failed with message: ",tryL," Defaulted to sequential execution",sep=""))
           L <- lapply(L,graph.antichains,cl=NULL)
           cat("*")
       }else{
           L <- tryL
       }
       
       ## convert antichains in L to vertex sets in g
       res <- parLapply(cl,1:length(L),
           function(i){
             gc()
             l <- Li
               cmp <- compi
               thisres <- list()
               for(cn in l){
                   thisres <- unique(c(thisres,list(unique(sort((which(cmp$membership %in% cn)-1) %% vcount(g))))))
               }
               return(thisres)
           }
       )
       res <- unlist(res,recursive=F)
       
       ## and see which are cutsets
       res <- unique(res[unlist(parLapply(cl,res,function(x){length(x)==k & is.cutset(x,g)}))])
   }
   
   
   return(res)
}
graph.residual <- function(g,i,j){ ## NOT GENERIC: deletes edges once they get to nonpositive capacity
   gc()
   if(!("capacity" %in% list.edge.attributes(g))) E(g)$capacity <- 1
   g.r <- simplify(g)
   done <- F
   while(!done){
       p <- get.shortest.paths(g.r,i,j,mode="out")1
       if(length(p)<1) done <- T
       E(g.r,path=p)$capacity <- E(g.r,path=p)$capacity - 1 
       g.r <- delete.edges(g.r,E(g.r)[capacity<=0])
       if(vertex.disjoint.paths(g.r,i,j)==0) done <- T
   }
   return(g.r)
}
graph.shrink <- function(g,comp=clusters(g,mode="strong"),remove.multiple=T,remove.loops=T){
   gc()
   res <- graph.empty(length(comp$csize),directed=T)
   el <- get.edgelist(g)
   el.comp <- rbind(comp$membership[el[,1]+1],comp$membership[el[,2]+1])
   res <- add.edges(res,el.comp)
   res <- simplify(res,remove.multiple=remove.multiple,remove.loops=remove.loops)
   return(res)
}
graph.antichains <- function(g,cl=NULL){ ## g must be a directed acyclic graph 
#############################################################################


######################################################
## Examples of use for cohesive blocking.           ##
######################################################


#####
# 1 # Create a graph to work on:
#####
## reconstruct the example from Moody, White (2003)
mwExample <- graph.empty(23,directed=F)
V(mwExample)$label <- 1:23
mwExample <- add.edges(mwExample,
   c(1,2,
     1,3,
     1,4,
     1,5,
     1,6,
     2,3,
     2,4,
     2,5,
     2,7,
     3,4,
     3,6,
     3,7,
     4,5,
     4,6,
     4,7,
     5,6,
     5,7,
     5,21,
     6,7,
     7,8,
     7,11,
     7,14,
     7,19,
     8,9,
     8,11,
     8,14,
     9,10,
     10,12,
     10,13,
     11,12,
     11,14,
     12,16,
     13,16,
     14,15,
     15,16,
     17,18,
     17,19,
     17,20,
     18,20,
     18,21,
     19,20,
     19,22,
     19,23,
     20,21,
     21,22,
     21,23,
     22,23
   )-1)
mw <- mwExample
mwExampleFixed <- add.edges(mwExample,c(19,21)) ## add an edge to make 3-component of {17-23} as in Moody-White
mw2 <- mwExampleFixed
#####
# 2 # Run cohesive blocking algorithm:
#####
mwBlocks <- cohesive.blocks(mw)
mwBlocks2 <- cohesive.blocks(mw2)
#####
# 3 # Plot the results:
#####
plot(mwBlocks)
# if a layout algorithm takes a long time for a
# particular graph, it can be saved to the object:
mwBlocks2 <- add.kk.layout(mwBlocks2) ## kamada kawai
plot(mwBlocks2)
mwBlocks2 <- add.svd.layout(mwBlocks2,s=1) ## single value decomposition
plot(mwBlocks2)
#####
# 4 # Save as Pajek files:
#####
write.pajek.bgraph(mwBlocks,file="mwExample1")
write.pajek.bgraph(mwBlocks2,file="mwExample2")
Personal tools