PFAB algorithm

Matt Moores

2018-06-06

A new version 0.5-0 of my R package bayesImageS is now available on CRAN. It is accompanied by a revised version of my paper with Kerrie and Tony, “Scalable Bayesian inference for the inverse temperature of a hidden Potts model” (Moores, Pettitt & Mengersen, arXiv:1503.08066v2). This paper introduces the parametric functional approximate Bayesian (PFAB) algorithm (the ‘p’ is silent…), which is a form of Bayesian indirect likelihood (BIL).

PFAB splits computation into 3 stages:

  1. Simulation for fixed \(\beta\) using Swendsen-Wang
  2. Fitting a parametric surrogate model using Stan
  3. Approximate posterior inference using Metropolis-within-Gibbs

Stage 1

For Stage 1, I used 2000 iterations of SW for each of 72 values of \(\beta\), but this is really overkill for most applications. I chose 72 values because I happened to have a 36-core, hyperthreaded CPU available. Here I’ll just be running everything on my laptop (an i7 CPU with 4 hyperthreaded cores), so 28 values should be plenty. The idea is to place the values more densely close to the critical temperature, where the variance (and hence the gradient of the score function) is greatest.
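For reference, the critical inverse temperature of the \(k\)-state Potts model is \(\beta_{crit} = \log(1 + \sqrt{k})\), which is where the grid of values below is concentrated. For \(k=6\) labels this works out to roughly 1.24:

k <- 6             # number of unique labels
log(1 + sqrt(k))   # critical inverse temperature, approximately 1.2382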

For our precomputation step, we need to know the image dimensions \(n\) and the number of labels \(k\) that we will use for pixel classification. We’ll be using the Lake of Menteith dataset from Bayesian Essentials with R (Marin & Robert, 2014):

library(bayess)
data("Menteith")
iter <- 800
burn <- iter/4 + 1
n <- prod(dim(Menteith))
k <- 6
image(as.matrix(Menteith),
      asp=1,xaxt='n',yaxt='n',col=gray(0:255/255))
Lake of Menteith, Scotland

The precomputation step is usually the most expensive part, but for \(100 \times 100\) pixels it should only take around 20 seconds:

library(bayesImageS)
bcrit <- log(1 + sqrt(k))   # critical inverse temperature for k labels
# 28 values of beta, spaced more densely around bcrit
beta <- sort(c(seq(0,1,by=0.1),seq(1.05,1.15,by=0.05),bcrit-0.05,bcrit-0.02,bcrit+0.02,
            seq(1.3,1.4,by=0.05),seq(1.5,2,by=0.1),2.5,3))
mask <- matrix(1, nrow=sqrt(n), ncol=sqrt(n))
neigh <- getNeighbors(mask, c(2,2,0,0))   # first-order (4-neighbour) lattice
block <- getBlocks(mask, 2)               # pixel blocks for chequerboard updates
edges <- getEdges(mask, c(2,2,0,0))
maxS <- nrow(edges)             # maximum of the sufficient statistic S(z)
E0 <- maxS/k                    # E[S(z)] at beta=0 (independent, uniform labels)
V0 <- maxS*(1/k)*(1 - 1/k)      # approximate Var[S(z)] at beta=0
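As a sanity check (assuming that getEdges lists each undirected edge of the lattice exactly once), a first-order neighbourhood on a \(100 \times 100\) lattice with free boundaries has \(2 \times 100 \times 99 = 19800\) edges, which is the maximum attainable value of \(S(z)\):

# number of edges in the 4-neighbour lattice, and the moments of S(z) at beta=0
c(maxS = maxS, expected = 2*100*99)
c(E0 = E0, V0 = V0)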

Embarrassingly parallel, using all available CPU cores:

library(doParallel)
cores <- min(detectCores(), length(beta))
print(paste("Parallel computation using",cores,"CPU cores:",
            iter,"iterations for",length(beta),"values of beta."))
cl <- makeCluster(cores)
print(cl)
clusterSetRNGStream(cl)
registerDoParallel(cl)

Simulate from the prior to verify the critical value of \(\beta\):

tm <- system.time(matu <- foreach(i=1:length(beta),
          .packages=c("bayesImageS"), .combine='cbind') %dopar% {
  res <- swNoData(beta[i],k,neigh,block,iter)   # Swendsen-Wang without data, for fixed beta
  res$sum
})
simSW <- list(matu=matu)   # one column of sufficient statistics S(z) per value of beta
stopCluster(cl)
print(tm)
##    user  system elapsed 
##   0.185   0.151  19.384

This shows the piecewise linear approximation that we used in our first paper (Moores et al., STCO 2015):

plot(x=beta, y=colMeans(simSW$matu),main="",
     xlab=expression(beta),ylab=expression(S(z)))
lrcst <- approxfun(beta, colMeans(simSW$matu))
curve(lrcst, 0, max(beta), add=TRUE, col="blue")
abline(v=bcrit,col="red",lty=3)
abline(h=maxS,col=2,lty=2)
points(0,E0,col=2,pch=2)
Piecewise linear approximation

Stage 2

Instead of the piecewise linear approximation, for Stage 2 we will use Stan to fit a parametric integral curve to the simulated sufficient statistics.
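To see why it is an integral curve, note that the Potts likelihood is an exponential family with natural parameter \(\beta\) and sufficient statistic \(S(z)\), so the derivative of the expectation equals the variance:

\[
\frac{\mathrm{d}}{\mathrm{d}\beta}\,\mathrm{E}[S(z) \mid \beta] = \mathrm{Var}[S(z) \mid \beta] .
\]

The mean function ft in the Stan program below is therefore the integral of the variance function dfdt, and the two share the same parameters: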

functions {
  // surrogate mean of S(z) as a function of t (i.e. beta): the integral of dfdt below
  vector ft(vector t, real tC, real e0, real ecrit, real v0, real vmaxLo, real vmaxHi, real phi1, real phi2) {
    vector[num_elements(t)] mu;
    real sqrtBcritPhi = sqrt(tC)*phi1;
    for (i in 1:num_elements(t)) {
      if (t[i] <= tC) {
        real sqrtBdiffPhi = sqrt(tC - t[i])*phi1;
        mu[i] = e0 + t[i]*v0 - ((2*(vmaxLo-v0))/(phi1^2))*((sqrtBcritPhi + 1)/exp(sqrtBcritPhi) - (sqrtBdiffPhi + 1)/exp(sqrtBdiffPhi));
      } else {
        real sqrtBdiff = sqrt(t[i] - tC);
        mu[i] = ecrit - ((2*vmaxHi)/phi2)*(sqrtBdiff/exp(phi2*sqrtBdiff) + (exp(-phi2*sqrtBdiff) - 1)/phi2);
      }
    }
    return mu;
  }

  // surrogate variance of S(z): the derivative of ft with respect to t
  vector dfdt(vector t, real tC, real v0, real vmaxLo, real vmaxHi, real r1, real r2) {
    vector[num_elements(t)] dmu;
    for (i in 1:num_elements(t)) {
      if (t[i] <= tC) {
        dmu[i] = v0 + (vmaxLo-v0)*exp(-r1*sqrt(tC - t[i]));
      } else {
        dmu[i] = vmaxHi*exp(-r2*sqrt(t[i] - tC));
      }
    }
    return dmu;
  }
}
data {
  int<lower = 1> M;
  int<lower = 1> N;
  real<lower = 1> maxY;
  real<lower = 1> Vlim;
  real<lower = 0> e0;
  real<lower = 0> v0;
  real tcrit;
  matrix<lower=0, upper=maxY>[M,N] y;
  vector[M] t;
}
parameters {
  real<lower = 0> a;
  real<lower = 0> b;
  real<lower = e0, upper=maxY> ecrit;
  real<lower = 0, upper=Vlim> vmaxLo;
  real<lower = 0, upper=Vlim> vmaxHi;
}
transformed parameters {
  vector[M] curr_mu;    // surrogate mean of S(z) at each value of t
  vector[M] curr_var;   // surrogate variance of S(z): the derivative of ft
  curr_mu = ft(t, tcrit, e0, ecrit, v0, vmaxLo, vmaxHi, a, b);
  curr_var = dfdt(t, tcrit, v0, vmaxLo, vmaxHi, a, b);
}
model {
  for (i in 1:M) {
    y[i] ~ normal(curr_mu[i], sqrt(curr_var[i]));
  }
}

For comparison, see a previous blog post where I fitted a simple, logistic curve using Stan.
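The sampling call below refers to PFAB, the compiled Stan model for the program listed above. In the R Markdown source this would be created by the Stan chunk itself; when running the code interactively, something along these lines will do (a sketch, assuming the program has been saved as PFAB.stan):

library(rstan)
# compile the surrogate model defined in the Stan program above
PFAB <- stan_model(file = "PFAB.stan")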

library(rstan)
options(mc.cores = min(4, parallel::detectCores()))
dat <- list(M=length(beta), N=iter-burn+1, maxY=maxS, e0=E0, v0=V0,
            Vlim=2*maxS*log(maxS)/pi, tcrit=bcrit, y=t(simSW$matu[burn:iter,]), t=beta)
tm2 <- system.time(fit <- sampling(PFAB, data = dat, verbose=TRUE, iter=5000,
                            control = list(adapt_delta = 0.9, max_treedepth=20)))
print(tm2)
##     user   system  elapsed 
##    6.032    7.794 2140.408
library(knitr)
fitStan <- summary(fit, pars=c("a","b","ecrit","vmaxLo","vmaxHi"), probs=c(0.025,0.975))
kable(fitStan$summary, caption="Posterior summary of the surrogate model parameters",
      align = 'lrrrrrr', digits=3)
Posterior summary of the surrogate model parameters

                 mean  se_mean       sd        2.5%       97.5%     n_eff   Rhat
a               4.546    0.000    0.005       4.535       4.556  3018.587  1.001
b               6.674    0.000    0.002       6.671       6.678  5517.964  1.000
ecrit       14210.513    0.042    2.996   14204.672   14216.457  5209.181  1.000
vmaxLo      58862.196    3.464  189.405   58489.777   59232.045  2989.964  1.001
vmaxHi     124677.101    0.599   30.050  124595.047  124706.591  2519.701  1.001

Roughly 35 minutes to fit the surrogate model makes this the most expensive step, but only because the first step was so fast. For much larger images (a megapixel or more), it will be the other way around.
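The parameter values that are hard-coded in the plotting calls below (and again in Stage 3) are point estimates taken from a fit like this one. They could also be extracted programmatically from the fitStan summary created above, for example (est is just an illustrative name):

# posterior means of the surrogate parameters, for the plots and for Stage 3
est <- fitStan$summary[, "mean"]
est[c("ecrit", "vmaxLo", "vmaxHi", "a", "b")]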

# R version of the Stan function ft: the surrogate mean of S(z) as a function of beta
ft <- function(t, tC, e0, ecrit, v0, vmax1, vmax2, phi1, phi2) {
  sqrtBcritPhi <- sqrt(tC)*phi1
  fval <- numeric(length(t))
  for (i in 1:length(t)) {
    if (t[i] <= tC) {
      sqrtBdiffPhi <- sqrt(tC - t[i])*phi1
      fval[i] <- e0 + t[i]*v0 - ((2*(vmax1-v0))/(phi1^2))*((sqrtBcritPhi + 1)/exp(sqrtBcritPhi) - (sqrtBdiffPhi + 1)/exp(sqrtBdiffPhi))
    } else {
      sqrtBdiff <- sqrt(t[i] - tC)
      fval[i] <- ecrit - ((2*vmax2)/phi2)*(sqrtBdiff/exp(phi2*sqrtBdiff) + (exp(-phi2*sqrtBdiff) - 1)/phi2)
    }
  }
  return(fval)
}

plot(range(beta),range(simSW$matu),type='n',xlab=expression(beta),ylab=expression(S(z)))
idx <- sample(burn:iter, size=20)   # 20 post burn-in iterations
abline(v=bcrit,col="red",lty=3)
abline(h=maxS,col=2,lty=2)
points(rep(beta,each=20),simSW$matu[idx,],pch=20)
lines(beta, ft(beta, bcrit, E0, 14237, V0, 59019, 124668, 4.556, 6.691),
      col=4, lwd=2)

To really see how well this approximation fits the true model, we need to look at the residuals:

# residuals of the simulated sufficient statistics about the fitted surrogate mean
residMx <- matrix(nrow=iter-burn+1, ncol=length(beta))
for (b in 1:length(beta)) {
  residMx[,b] <- simSW$matu[burn:iter,b] - ft(beta[b], bcrit, E0, 14237, V0, 59019, 124668, 4.556, 6.691)
}

# R version of the Stan function dfdt: the surrogate variance of S(z)
dfdt <- function(t, tC, V0, Vmax1, Vmax2, r1, r2) {
  ifelse(t < tC,
         V0 + (Vmax1-V0)*exp(-r1*sqrt(tC - t)),
         Vmax2*exp(-r2*sqrt(t - tC)))
}

plot(range(beta),range(residMx),type='n',xlab=expression(beta),ylab="residuals")
abline(h=0,lty=2,col=4,lwd=2)
points(rep(beta,each=iter-burn+1),residMx,pch='.',cex=3)

x <- sort(c(seq(0,3,by=0.01),bcrit))
# envelope of +/- 3 approximate standard deviations under the surrogate model
lines(x, 3*sqrt(dfdt(x, bcrit, V0, 59019, 124668, 4.556, 6.691)), col=2, lwd=2)
lines(x, -3*sqrt(dfdt(x, bcrit, V0, 59019, 124668, 4.556, 6.691)), col=2, lwd=2)

This shows that 28 values of \(\beta\) were enough to obtain a high-quality fit between the true model and the surrogate. The location of \(\beta_{crit}\) is slightly off, because \(100 \times 100\) pixels isn’t quite enough for the asymptotics to kick in.
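A rough numerical check of the same point, using the residMx matrix and the dfdt function defined above: the proportion of residuals falling inside the \(\pm 3\) standard deviation envelope should be close to one.

# approximate standard deviation of S(z) at each simulated value of beta
sdFit <- sqrt(dfdt(rep(beta, each=nrow(residMx)), bcrit, V0, 59019, 124668, 4.556, 6.691))
# proportion of residuals inside the +/- 3 sd envelope
mean(abs(as.vector(residMx)) < 3*sdFit)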

Stage 3

Now that we have our surrogate model, we can proceed to the final stage, which is to perform image segmentation using mcmcPotts:

# surrogate model parameters from Stage 2 (posterior point estimates from the Stan fit)
mh <- list(algorithm="aux", bandwidth=0.02, Vmax1=59019,
           Vmax2=124668, E0=E0, Ecrit=14237, phi1=4.556,
           phi2=6.691, factor=1, bcrit=bcrit, V0=V0)
# priors for the k mixture components and the range of the prior on beta
priors <- list()
priors$k <- k
priors$mu <- c(0, 50, 100, 150, 200, 250)
priors$mu.sd <- rep(10,k)
priors$sigma <- rep(20,k)
priors$sigma.nu <- rep(5, k)
priors$beta <- c(0,3)
iter <- 1e4
burn <- iter/2
y <- as.vector(as.matrix(Menteith))
print(date())
## [1] "Wed Jun  6 16:26:22 2018"
tm3 <- system.time(resPFAB <- 
             mcmcPotts(y,neigh,block,priors,mh,iter,burn))
## 10000 iterations of aux discarding the first 5000 as burn-in.
print(tm3)
##    user  system elapsed 
##  21.205   0.077  21.315

Now we compare with the approximate exchange algorithm:

mh <- list(algorithm="ex", bandwidth=0.02, auxiliary=200)
tm4 <- system.time(resAEA <- 
                     mcmcPotts(y,neigh,block,priors,mh,iter,burn))
resAEA$time <- tm4[3]
print(resAEA$time)
##  elapsed 
## 2306.565

That is a speedup of over 100 times in comparison to AEA. There is reasonably good agreement between the posterior distributions:

densPFAB <- density(resPFAB$beta[burn:iter])
densAEA <- density(resAEA$beta[burn:iter])
plot(densAEA, col=4, lty=2, lwd=2, main="", xlab=expression(beta),
     xlim=range(resPFAB$beta[burn:iter],resAEA$beta[burn:iter]))
lines(densPFAB, col=2, lty=3, lwd=3)
abline(h=0,lty=2)
legend("topright",legend=c("AEA","PFAB"),col=c(4,2),lty=c(2,3),
       lwd=3)
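To back up the visual comparison with numbers, the posterior means and standard deviations of \(\beta\) under the two algorithms can be tabulated from the same post burn-in draws:

# posterior mean and standard deviation of beta for PFAB and AEA
rbind(PFAB = c(mean = mean(resPFAB$beta[burn:iter]), sd = sd(resPFAB$beta[burn:iter])),
      AEA  = c(mean = mean(resAEA$beta[burn:iter]),  sd = sd(resAEA$beta[burn:iter])))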