Numerical tolerance for spectral decompositions of random matrices

Department of Applied Mathematics and Statistics
Center for Imaging Science
Johns Hopkins University
and
Paradigm, Inc.
Yale University
and
University of Maryland
and
North Carolina State University


Avanti Athreya, Michael Kane, Bryan Lewis, Zachary Lubberts, Vince Lyzinski, Youngser Park, Carey E. Priebe, Minh Tang, “Numerical tolerance for spectral decompositions of random matrices,” submitted, 2020.

Abstract

We precisely quantify the impact of statistical error in the quality of a numerical approximation to a random matrix eigendecomposition, and under mild conditions, we use this to introduce an optimal numerical tolerance for residual error in spectral decompositions of random matrices. We demonstrate that terminating an eigendecomposition algorithm when the numerical error and statistical error are of the same order results in computational savings with no loss of accuracy. We also repair a flaw in a ubiquitous termination condition, one in wide employ in several computational linear algebra implementations. We illustrate the practical consequences of our stopping criterion with an analysis of simulated and real networks. Our theoretical results and real-data examples establish that the tradeoff between statistical and numerical error is of significant import for data science.

Codes and Experiments

Simulation (run this on cis machine!)

procrustes <- function(X,Y, type = "I"){
    if(type == "C"){
        X <- X/norm(X, type = "F")*sqrt(nrow(X))
        Y <- Y*norm(X, type = "F")*sqrt(nrow(Y))
    }
    if(type == "D"){
        tX <- rowSums(X^2)
        tX[tX <= 1e-15] <- 1
        tY <- rowSums(Y^2)
        tY[tY <= 1e-15] <- 1
        X <- X/sqrt(tX)
        Y <- Y/sqrt(tY)
    }

    tmp <- t(X)%*%Y
    tmp.svd <- svd(tmp)
    W <- tmp.svd$u %*% t(tmp.svd$v)
    err <- norm(X%*%W - Y, type = "F")
    return(list(W=W,err=err))
}

bb <- 10
#bb <- seq(200,250,by=10) 
B <- cbind( c(.5, .2,.2), c(.2,.5,.2),c(.2,.2, .5) ) / bb
m <- nrow(B)
bvec <- rep(300,m)
N <- sum(bvec); #rho <- bvec/N

rR <- rep(1:m,times=bvec)
P <- B[rR,rR]
U <- irlba(P,nu=m,nv=m,tol=1e-06)$u # default is 1e-05, use 1e-06 instead.

nmc <- 50
kvec <- seq(1,20)
err <- iter <- matrix(0,nmc,length(kvec))
for (mc in 1:nmc) {
    set.seed(124+mc)
    g <- sbm.game(N,B,bvec)#; summary(g)

    ind <- 1
    for (k in 1:length(kvec)) {
        out <- irlba(g[],nu=m,nv=m,tol=2^-kvec[k])
        iter[mc,ind] <- out$iter
        Uhat <- out$u
        err[mc,ind] <- procrustes(U,Uhat)$err
        ind <- ind+1
    }
}
err.mean <- colMeans(err)
err.sd <- apply(err,2,sd) / sqrt(nmc)

df1 <- data.frame(k=kvec, mean=colMeans(err), err=apply(err,2,sd)/sqrt(nmc), type="error")
df2 <- data.frame(k=kvec, mean=colMeans(iter), err=apply(iter,2,sd)/sqrt(nmc), type="iteration")
df <- rbind(df1,df2)

bi <- 3000
b <- seq(150,250,by=10)

out <- foreach (i=b) %dopar% {
    B <- cbind( c(.5,.2,.2), c(.2,.5,.2),c(.2,.2,.5) ) / i
    m <- nrow(B)
    bvec <- rep(bi,m)
    N <- sum(bvec); #rho <- bvec/N

    rR <- rep(1:m,times=bvec)
    P <- B[rR,rR]
    U <- irlba(P,nu=m,nv=m,tol=1e-06)$u

    nmc <- 50
    kvec <- seq(1,20)
    err <- iter <- matrix(0,nmc,length(kvec))
    for (mc in 1:nmc) {
        set.seed(124+mc)
        g <- sbm.game(N,B,bvec)#; summary(g)

        ind <- 1
        for (k in 1:length(kvec)) {
            out <- irlba(g[],nu=m,nv=m,tol=2^-kvec[k])
            iter[mc,ind] <- out$iter
            Uhat <- out$u
            err[mc,ind] <- procrustes(U,Uhat)$err
            ind <- ind+1
        }
    }
    err.mean <- colMeans(err)
    err.sd <- apply(err,2,sd) / sqrt(nmc)

    df1 <- data.frame(k=kvec, mean=colMeans(err), err=apply(err,2,sd)/sqrt(nmc), type="err")
    df2 <- data.frame(k=kvec, mean=colMeans(iter), err=apply(iter,2,sd)/sqrt(nmc), type="iter")
    df <- rbind(df1,df2)
#    save(df, file=paste0("~/Dropbox/Tolerance/df-n9k-b",i,".Rbin"))
}

Youtube Example

https://snap.stanford.edu/data/com-Youtube.html

Youtube is a video-sharing web site that includes a social network. In the Youtube social network, users form friendship each other and users can create groups which other users can join. We consider such user-defined groups as ground-truth communities. This data is provided by Alan Mislove et al.

We regard each connected component in a group as a separate ground-truth community. We remove the ground-truth communities which have less than 3 nodes. We also provide the top 5,000 communities with highest quality which are described in our paper. As for the network, we provide the largest connected component.

Nodes 1134890
Edges 2987624

Number of communities 8,385
Average community size 13.50
Average membership size 0.10

  1. The graph is a strongly connected.
  2. irlba is used.
  3. ZG yields \(\hat{d} = 26\).
  4. pamk on \(\hat{X}=n\times \hat{d}\) yields \(\hat{K} = 4\).
  5. \(ari_1\): \(ari(Y_0,Y_k)\) compares the clustering of \(\hat{X}\) into \(\hat{K}\) with \(pamk(\hat{X}',\hat{K})\) with \(tol=2^{-k}\).
  6. \(ari_2\): \(ari(Y_{k-1},Y_k)\) compares a clustering with \(tol=2^{-(k-1)}\) with a clustering with \(tol=2^{-k}\) for \(k>1\).
  7. nmc=10 over each tolerance \(k\) was performed and average values of each measures are reported.
require(data.table)
dt <- fread(input = 'zcat < com-youtube.ungraph.txt.gz')

require(igraph)
system.time(g <- graph_from_edgelist(as.matrix(dt), directed = FALSE))
cl <- igraph::clusters(g)
lcc <- induced.subgraph(g, which(cl$membership == which.max(cl$csize)))

require(irlba)
m <- 50
system.time(out <- irlba(lcc[],nu=m,nv=m,tol=1e-06))
str(out)

elb <- dim_select(out$d)
U <- out$u[,1:elb]

require(fpc)
system.time(pamkout <- pamk(U, krange=2:10, usepam=FALSE))
(Khat <- pamkout$nc) 
system.time(pout <- pamk(U,Khat,usepam=FALSE))
memb <- pout$pamobject$clustering

require(mclust)
nmc2 <- 10
kvec <- seq(1,20)

iter.mean <- ari1.mean <- ari2.mean <- etime.mean <- rep(0,length(kvec))
iter.sd <- ari1.sd <- ari2.sd <- etime.sd <- rep(0,length(kvec))

for (i in 1:20) {
    set.seed(123+i) 
    cat("i = ", i, "\n")
    ii <- tt <- aa1 <- aa2 <- 0
    tmp <- foreach (j=1:nmc2, .combine='rbind') %dopar% {
        cat("   j = ", j, "\n")
        ptm <- proc.time()[3]
        out <- irlba(lcc[],elb,elb,tol=2^-kvec[i])
        ii <- out$iter
        tt <- proc.time()[3] - ptm
        pout <- pamk(out$u[,1:elb],Khat,usepam=FALSE)
        cl <- pout$pamobject$clustering
        if (j==1) {cl.old <- cl}; #save(cl.old,file="cl6-old.Rbin")}
        aa1 <- adjustedRandIndex(cl,memb)
        if (i>1) {
#            load("cl6-old.Rbin")
            aa2 <- adjustedRandIndex(cl,cl.old)
        }
        ppp <- c(ii,tt,aa1,aa2)
        ppp
    }
    print(format(tmp,digits=2))
    iter.mean[i] <- mean(tmp[,1]); iter.sd[i] <- sd(tmp[,1]) / sqrt(nmc2)
    etime.mean[i] <- mean(tmp[,2]); etime.sd[i] <- sd(tmp[,2]) / sqrt(nmc2)
    ari1.mean[i] <- mean(tmp[,3]); ari1.sd[i] <- sd(tmp[,3]) / sqrt(nmc2)
    ari2.mean[i] <- mean(tmp[,4]); ari2.sd[i] <- sd(tmp[,4]) / sqrt(nmc2)
    cat("i =",i, ", iter =", iter.mean[i],", ari =",ari1.mean[i],", ari2 =",ari2.mean[i],", etime =", etime.mean[i],"\n")
}
# save.image(file="tolerance-youtube-1e-06.Rbin")