#! /usr/bin/Rscript
###
### Exploratory analysis of real data - InstaTwit
### For 2,4,6,8,10 seeds, calculates the average
### proportion of the time that i.voi is matched to t.voi
### plots the average number of times i.voi is matched correctly along
### with error-bars.
### goes through all possible sets of s seeds taken from the seed set --
### always including center seed
###
### Heather Gaddy Patsolic
### 2015 <[email protected]>
### S.D.G
#
require(raster)
require(igraph)
require(clue)
require(foreach)
require(ggplot2)
require(doMC)
require(VN)
source("vntmp.r")
## User: If false, loads rdata file. If true, runs full script.
run <- FALSE
if(run){
h <- 2 # set the size of the neighborhood to look at
ell <- 2
g <- 0.1
R <- 30
iter <- 20
load("itdat.RData")
# t.nvoi <- setdiff(1:nrow(twitadj),c(ssrm[,1],t.voi))
# i.nvoi <- setdiff(1:nrow(instadj),c(ssrm[,2],i.voi))
# twitadj <- twitadj[c(ssrm[,1],t.voi,t.nvoi),c(ssrm[,1],t.voi,t.nvoi)]
# instadj <- instadj[c(ssrm[,2],i.voi,i.nvoi),c(ssrm[,2],i.voi,i.nvoi)]
# twitgraph <- graph_from_adjacency_matrix(twitadj,mode="undirected")
# instagraph <- graph_from_adjacency_matrix(instadj,mode="undirected")
avgnom <- rep(NA,5)
cilower <- rep(NA,5)
ciupper <- rep(NA,5)
L <- list()
set.seed(56781)
for(index in 1:5){
ss1 <- combn(1:9,2*index-1)
ssind <- rbind(ss1,10)
kindex <- ncol(ssind)
# keep track of location of i.voi in nomination list
sumlocivoi <- rep(NA,kindex)
## no need to save nbd size. always same because all vertices are 1
## away from 8th vertex, which is always in seedset
for(aind in 1:kindex){
ssr <- S[ssind[,aind],]
# tim <- localnbd(11,ssr,twitgraph,instagraph,2,2,R,g,sim=TRUE,verb=FALSE)
tim <- vntmp(t.voi,ssr,twitgraph,instagraph,h,ell,R,g,plotF=FALSE)
ranked <- pass2ranksuplus(tim$P[1:nrow(tim$map1),1:nrow(tim$map2)])
v1 <- which(tim$map1[,1]==t.voi)
v2 <- which(tim$map2[,1]==i.voi)
tim$P[v1,v2]
sortvoi <- ranked[v1,v2]
normrank <- (sortvoi - 1)/(nrow(tim$map2)-nrow(tim$seedsused)-1)
sumlocivoi[aind] <- normrank
}
xbar <- sum(sumlocivoi)/kindex
sterr <- sd(sumlocivoi)/sqrt(kindex)
cilower[index] <- xbar - 2*sterr
ciupper[index] <- xbar + 2*sterr
avgnom[index] <- xbar
L[[index]] <- list(seeds=ssind, normrank=sumlocivoi)
}
}else{
load("instatwitNomListing.RData")
}
x<-c(2,4,6,8,10)
CI <- cbind(cilower,ciupper) # for 95% CI, use 1.96 instead of 2
CI[5,] <- 0
df <- data.frame(x,avgnom,CI)
colnames(df) <- c("x", "avgnom", "CIL", "CIU")
p01 <- ggplot(data=df, aes(x=x, y = avgnom)) +
geom_line() +
geom_point() +
theme(axis.line.x = element_line(colour="black"),
axis.line.y = element_line(colour="black")) +
geom_errorbar(aes(ymin=CIL, ymax=CIU)) + #,colour="salmon") +
scale_x_continuous(breaks=c(2,4,6,8,10),labels=c(2,4,6,8,10)) +
ylab(expression(paste(tau,"(x')"))) +
xlab(expression(s[x])) +
theme(axis.text=element_text(size=20),
plot.title=element_text(size=20),
axis.title=element_text(size=20,face="bold"))+
labs(title= expression(paste("Effects of ", s[x], ": Instagram and Twitter")))
#tau(x), " matching Instagram to Twitter")))
#pdf("instatwit_permall.pdf",height=4,width=6)
p01
#dev.off()
## Determining which seeds give optimal performance.
is0 <- lapply(L,function(x){which(x$normrank==0)})
optseed <- lapply(L,function(x){x$seeds[,which(x$normrank==0)]})
is9there <- unlist(lapply(optseed,function(x){sum(x==9)}))
cols <- unlist(lapply(optseed,function(x){ifelse(!is.vector(x),ncol(x),1)}))
cols-is9there
## [1] 0 0 0 0 0
ischance <- unlist(lapply(L,function(x){sum(x$normrank>=0.5)}))
extremes <- ischance + unlist(lapply(is0,function(x){length(x)}))
s <- seq(1,9,by=2)
total <- apply(as.matrix(s),1,function(x){choose(9,x)})
all(total==extremes)
## [1] TRUE
print("Seeds required for perfect matching are 8 and 9.") #
## [1] "Seeds required for perfect matching are 8 and 9."
save.image("instatwitNomListing.RData")
# Time:
## Working status:
### Comments:
####Soli Deo Gloria