Department of Applied Mathematics and Statistics
Center for Imaging Science
Johns Hopkins University
and
Paradigm, Inc.
Yale University
and
University of Maryland
and
North Carolina State University
Avanti Athreya, Michael Kane, Bryan Lewis, Zachary Lubberts, Vince Lyzinski, Youngser Park, Carey E. Priebe, Minh Tang, “Numerical tolerance for spectral decompositions of random matrices,” submitted, 2020.
We precisely quantify the impact of statistical error in the quality of a numerical approximation to a random matrix eigendecomposition, and under mild conditions, we use this to introduce an optimal numerical tolerance for residual error in spectral decompositions of random matrices. We demonstrate that terminating an eigendecomposition algorithm when the numerical error and statistical error are of the same order results in computational savings with no loss of accuracy. We also repair a flaw in a ubiquitous termination condition, one in wide employ in several computational linear algebra implementations. We illustrate the practical consequences of our stopping criterion with an analysis of simulated and real networks. Our theoretical results and real-data examples establish that the tradeoff between statistical and numerical error is of significant import for data science.
procrustes <- function(X,Y, type = "I"){
if(type == "C"){
X <- X/norm(X, type = "F")*sqrt(nrow(X))
Y <- Y*norm(X, type = "F")*sqrt(nrow(Y))
}
if(type == "D"){
tX <- rowSums(X^2)
tX[tX <= 1e-15] <- 1
tY <- rowSums(Y^2)
tY[tY <= 1e-15] <- 1
X <- X/sqrt(tX)
Y <- Y/sqrt(tY)
}
tmp <- t(X)%*%Y
tmp.svd <- svd(tmp)
W <- tmp.svd$u %*% t(tmp.svd$v)
err <- norm(X%*%W - Y, type = "F")
return(list(W=W,err=err))
}
bb <- 10
#bb <- seq(200,250,by=10)
B <- cbind( c(.5, .2,.2), c(.2,.5,.2),c(.2,.2, .5) ) / bb
m <- nrow(B)
bvec <- rep(300,m)
N <- sum(bvec); #rho <- bvec/N
rR <- rep(1:m,times=bvec)
P <- B[rR,rR]
U <- irlba(P,nu=m,nv=m,tol=1e-06)$u # default is 1e-05, use 1e-06 instead.
nmc <- 50
kvec <- seq(1,20)
err <- iter <- matrix(0,nmc,length(kvec))
for (mc in 1:nmc) {
set.seed(124+mc)
g <- sbm.game(N,B,bvec)#; summary(g)
ind <- 1
for (k in 1:length(kvec)) {
out <- irlba(g[],nu=m,nv=m,tol=2^-kvec[k])
iter[mc,ind] <- out$iter
Uhat <- out$u
err[mc,ind] <- procrustes(U,Uhat)$err
ind <- ind+1
}
}
err.mean <- colMeans(err)
err.sd <- apply(err,2,sd) / sqrt(nmc)
df1 <- data.frame(k=kvec, mean=colMeans(err), err=apply(err,2,sd)/sqrt(nmc), type="error")
df2 <- data.frame(k=kvec, mean=colMeans(iter), err=apply(iter,2,sd)/sqrt(nmc), type="iteration")
df <- rbind(df1,df2)
bi <- 3000
b <- seq(150,250,by=10)
out <- foreach (i=b) %dopar% {
B <- cbind( c(.5,.2,.2), c(.2,.5,.2),c(.2,.2,.5) ) / i
m <- nrow(B)
bvec <- rep(bi,m)
N <- sum(bvec); #rho <- bvec/N
rR <- rep(1:m,times=bvec)
P <- B[rR,rR]
U <- irlba(P,nu=m,nv=m,tol=1e-06)$u
nmc <- 50
kvec <- seq(1,20)
err <- iter <- matrix(0,nmc,length(kvec))
for (mc in 1:nmc) {
set.seed(124+mc)
g <- sbm.game(N,B,bvec)#; summary(g)
ind <- 1
for (k in 1:length(kvec)) {
out <- irlba(g[],nu=m,nv=m,tol=2^-kvec[k])
iter[mc,ind] <- out$iter
Uhat <- out$u
err[mc,ind] <- procrustes(U,Uhat)$err
ind <- ind+1
}
}
err.mean <- colMeans(err)
err.sd <- apply(err,2,sd) / sqrt(nmc)
df1 <- data.frame(k=kvec, mean=colMeans(err), err=apply(err,2,sd)/sqrt(nmc), type="err")
df2 <- data.frame(k=kvec, mean=colMeans(iter), err=apply(iter,2,sd)/sqrt(nmc), type="iter")
df <- rbind(df1,df2)
# save(df, file=paste0("~/Dropbox/Tolerance/df-n9k-b",i,".Rbin"))
}
https://snap.stanford.edu/data/com-Youtube.html
Youtube is a video-sharing web site that includes a social network. In the Youtube social network, users form friendship each other and users can create groups which other users can join. We consider such user-defined groups as ground-truth communities. This data is provided by Alan Mislove et al.
We regard each connected component in a group as a separate ground-truth community. We remove the ground-truth communities which have less than 3 nodes. We also provide the top 5,000 communities with highest quality which are described in our paper. As for the network, we provide the largest connected component.
Nodes 1134890
Edges 2987624
Number of communities 8,385
Average community size 13.50
Average membership size 0.10
irlba
is used.pamk
on \(\hat{X}=n\times \hat{d}\) yields \(\hat{K} = 4\).nmc=10
over each tolerance \(k\) was performed and average values of each measures are reported.require(data.table)
dt <- fread(input = 'zcat < com-youtube.ungraph.txt.gz')
require(igraph)
system.time(g <- graph_from_edgelist(as.matrix(dt), directed = FALSE))
cl <- igraph::clusters(g)
lcc <- induced.subgraph(g, which(cl$membership == which.max(cl$csize)))
require(irlba)
m <- 50
system.time(out <- irlba(lcc[],nu=m,nv=m,tol=1e-06))
str(out)
elb <- dim_select(out$d)
U <- out$u[,1:elb]
require(fpc)
system.time(pamkout <- pamk(U, krange=2:10, usepam=FALSE))
(Khat <- pamkout$nc)
system.time(pout <- pamk(U,Khat,usepam=FALSE))
memb <- pout$pamobject$clustering
require(mclust)
nmc2 <- 10
kvec <- seq(1,20)
iter.mean <- ari1.mean <- ari2.mean <- etime.mean <- rep(0,length(kvec))
iter.sd <- ari1.sd <- ari2.sd <- etime.sd <- rep(0,length(kvec))
for (i in 1:20) {
set.seed(123+i)
cat("i = ", i, "\n")
ii <- tt <- aa1 <- aa2 <- 0
tmp <- foreach (j=1:nmc2, .combine='rbind') %dopar% {
cat(" j = ", j, "\n")
ptm <- proc.time()[3]
out <- irlba(lcc[],elb,elb,tol=2^-kvec[i])
ii <- out$iter
tt <- proc.time()[3] - ptm
pout <- pamk(out$u[,1:elb],Khat,usepam=FALSE)
cl <- pout$pamobject$clustering
if (j==1) {cl.old <- cl}; #save(cl.old,file="cl6-old.Rbin")}
aa1 <- adjustedRandIndex(cl,memb)
if (i>1) {
# load("cl6-old.Rbin")
aa2 <- adjustedRandIndex(cl,cl.old)
}
ppp <- c(ii,tt,aa1,aa2)
ppp
}
print(format(tmp,digits=2))
iter.mean[i] <- mean(tmp[,1]); iter.sd[i] <- sd(tmp[,1]) / sqrt(nmc2)
etime.mean[i] <- mean(tmp[,2]); etime.sd[i] <- sd(tmp[,2]) / sqrt(nmc2)
ari1.mean[i] <- mean(tmp[,3]); ari1.sd[i] <- sd(tmp[,3]) / sqrt(nmc2)
ari2.mean[i] <- mean(tmp[,4]); ari2.sd[i] <- sd(tmp[,4]) / sqrt(nmc2)
cat("i =",i, ", iter =", iter.mean[i],", ari =",ari1.mean[i],", ari2 =",ari2.mean[i],", etime =", etime.mean[i],"\n")
}
# save.image(file="tolerance-youtube-1e-06.Rbin")