itdata_normrankplot.r

#! /usr/bin/Rscript
###
### Exploratory analysis of real data - InstaTwit
### For 2,4,6,8,10 seeds, calculates the average
### proportion of the time that i.voi is matched to t.voi 
### plots the average number of times i.voi is matched correctly along
### with error-bars. 
### goes through all possible sets of s seeds taken from the seed set -- 
### always including center seed
###
### Heather Gaddy Patsolic 
### 2015 <[email protected]>
### S.D.G 
#


require(raster)
require(igraph)
require(clue)
require(foreach)
require(ggplot2)
require(doMC)
require(VN)
source("vntmp.r")

## User: If false, loads rdata file. If true, runs full script.
run <- FALSE


if(run){
    h <- 2 # set the size of the neighborhood to look at
    ell <- 2
    g <- 0.1
    R <- 30
    iter <- 20

    load("itdat.RData")
    
#    t.nvoi <- setdiff(1:nrow(twitadj),c(ssrm[,1],t.voi))
#    i.nvoi <- setdiff(1:nrow(instadj),c(ssrm[,2],i.voi))
#    twitadj <- twitadj[c(ssrm[,1],t.voi,t.nvoi),c(ssrm[,1],t.voi,t.nvoi)]
#    instadj <- instadj[c(ssrm[,2],i.voi,i.nvoi),c(ssrm[,2],i.voi,i.nvoi)]
#    twitgraph <- graph_from_adjacency_matrix(twitadj,mode="undirected")
#    instagraph <- graph_from_adjacency_matrix(instadj,mode="undirected")
    
    avgnom <- rep(NA,5)
    cilower <- rep(NA,5)
    ciupper <- rep(NA,5)
    L <- list()
    
    set.seed(56781)
    for(index in 1:5){
        ss1 <- combn(1:9,2*index-1)
        ssind <- rbind(ss1,10)
        
        kindex <- ncol(ssind)
        # keep track of location of i.voi in nomination list
        sumlocivoi <- rep(NA,kindex)
    
        ## no need to save nbd size. always same because all vertices are 1
        ## away from 8th vertex, which is always in seedset
    
        for(aind in 1:kindex){
            ssr <- S[ssind[,aind],]
            
#           tim <- localnbd(11,ssr,twitgraph,instagraph,2,2,R,g,sim=TRUE,verb=FALSE)
            tim <- vntmp(t.voi,ssr,twitgraph,instagraph,h,ell,R,g,plotF=FALSE)

            ranked <- pass2ranksuplus(tim$P[1:nrow(tim$map1),1:nrow(tim$map2)])

            v1 <- which(tim$map1[,1]==t.voi)
            v2 <- which(tim$map2[,1]==i.voi)
            tim$P[v1,v2]
            sortvoi <- ranked[v1,v2]
            normrank <- (sortvoi - 1)/(nrow(tim$map2)-nrow(tim$seedsused)-1)

            sumlocivoi[aind] <- normrank
        }
        xbar <- sum(sumlocivoi)/kindex
        sterr <- sd(sumlocivoi)/sqrt(kindex)
        cilower[index] <- xbar - 2*sterr
        ciupper[index] <- xbar + 2*sterr
        avgnom[index] <- xbar
        L[[index]] <- list(seeds=ssind, normrank=sumlocivoi)
    }
}else{
    load("instatwitNomListing.RData")
}

x<-c(2,4,6,8,10)

CI <- cbind(cilower,ciupper) # for 95% CI, use 1.96 instead of 2
CI[5,] <- 0

df <- data.frame(x,avgnom,CI)
colnames(df) <- c("x", "avgnom", "CIL", "CIU")

p01 <- ggplot(data=df, aes(x=x, y = avgnom)) + 
        geom_line()  + 
        geom_point() + 
        theme(axis.line.x = element_line(colour="black"),
              axis.line.y = element_line(colour="black")) +
        geom_errorbar(aes(ymin=CIL, ymax=CIU)) + #,colour="salmon") + 
        scale_x_continuous(breaks=c(2,4,6,8,10),labels=c(2,4,6,8,10)) + 
        ylab(expression(paste(tau,"(x')"))) + 
        xlab(expression(s[x])) +
        theme(axis.text=element_text(size=20),
    plot.title=element_text(size=20),
        axis.title=element_text(size=20,face="bold"))+
        labs(title= expression(paste("Effects of ", s[x], ": Instagram and Twitter")))
               #tau(x), " matching Instagram to Twitter")))


#pdf("instatwit_permall.pdf",height=4,width=6)
p01

#dev.off()



## Determining which seeds give optimal performance.
is0 <- lapply(L,function(x){which(x$normrank==0)})
optseed <- lapply(L,function(x){x$seeds[,which(x$normrank==0)]})
is9there <- unlist(lapply(optseed,function(x){sum(x==9)}))
cols <- unlist(lapply(optseed,function(x){ifelse(!is.vector(x),ncol(x),1)}))
cols-is9there

## [1] 0 0 0 0 0

ischance <- unlist(lapply(L,function(x){sum(x$normrank>=0.5)}))
extremes <- ischance + unlist(lapply(is0,function(x){length(x)}))
s <- seq(1,9,by=2)
total <- apply(as.matrix(s),1,function(x){choose(9,x)})

all(total==extremes)

## [1] TRUE

print("Seeds required for perfect matching are 8 and 9.") #

## [1] "Seeds required for perfect matching are 8 and 9."

save.image("instatwitNomListing.RData")

#   Time:
##  Working status:
### Comments:
####Soli Deo Gloria

itdata_normrankplot.r

HGP

Mon Dec 17 17:04:26 2018