Inference under MVN with missing data

Author

Dr. Alexander Fisher

Example: precision medicine

This example is from Hoff ch. 7.

Load libraries and data.

library(tidyverse)
library(mvtnorm)
library(monomvn)
library(coda)
Y = read_csv("https://sta360-fa23.github.io/data/Pima.csv") %>%
  as.matrix() 
colnames(Y) = NULL

This data set contains

  • glu blood plasma glucose concentration
  • bp diastolic blood pressure
  • skin skin fold thickness
  • bmi body mass index

for 200 women of Pima Indian heritage living near Phoenix, Arizona (Smith et al, 1988). Some observations are missing.

Inference using Gibbs sampling

Setup prior parameters and starting values.

## prior parameters
n = nrow(Y); p = ncol(Y)
# prior on theta
mu0 = c(120, 64, 26, 26); sd0 = (mu0 / 2)
L0 = matrix(.1, p, p)
diag(L0) = 1 
L0 = L0 * outer(sd0, sd0) # \Lambda_0
# prior on Sigma
nu0 = p + 2; 
S0 = L0
###

### starting values
Sigma = S0
Y.full = Y
O = 1 * (!is.na(Y)) # indices for observe values of Y

for(j in 1:p) {
  Y.full[is.na(Y.full[,j]),j]<-mean(Y.full[,j],na.rm=TRUE)
} 

Exercise: what does the code above set the starting values of \(\theta, \Sigma\) and \(\textbf{Y}_{\text{mis}}\) to?

The Gibbs sampler.

### Gibbs sampler
THETA <- SIGMA <- Y.MISS <- NULL
set.seed(360)

for(s in 1:1000) {

  ###update theta
  ybar <- apply(Y.full, 2 , mean)
  Ln <- solve(solve(L0) + n * solve(Sigma))
  mun <- Ln %*% (solve(L0) %*% mu0 + n * solve(Sigma) %*% ybar)
  theta <- rmvnorm(1, mun, Ln)
  ###
  
  ###update Sigma
  Sn <- S0 + (t(Y.full) - c(theta)) %*% t(t(Y.full) - c(theta))
  Sigma <- rwish(nu0 + n, solve(Sn))
  ###
  
  ###update missing data
  for(i in 1:n) { 
    b <- (O[i, ] == 0)
    a <- (O[i, ] == 1)
    if( sum(b) != 0) {
    iSa <- solve(Sigma[a, a])
    beta.j <- Sigma[b, a] %*% iSa
    s2.j   <- Sigma[b, b] - Sigma[b, a] %*% iSa %*% Sigma[a, b]
    theta.j <- theta[b] + beta.j %*% (as.matrix(Y.full[i, a]) - theta[a])
    Y.full[i, b] <- rmvnorm(1, theta.j, s2.j)
    }
  }
  
  ### save results
  THETA<-rbind(THETA,theta) ; SIGMA<-rbind(SIGMA,c(Sigma))
  Y.MISS<-rbind(Y.MISS, Y.full[O==0] )
  ###

  if(s %% 250 == 0 | s == 1) {
  cat(s,theta,"\n")
  }
}
1 129.6031 71.66437 28.97947 31.385 
250 123.6105 70.56862 29.10075 30.97481 
500 123.6067 70.36211 29.20791 31.03185 
750 123.6179 70.78825 29.29216 30.90945 
1000 123.6017 70.15369 29.22685 31.08064 
#### Posterior mean
apply(THETA,2,mean)
[1] 123.60189  70.57085  29.16132  31.79352

Effective sample size of THETA

# effective sample size of THETA 
apply(THETA, 2, effectiveSize)
[1]  879.4569 3680.4806 4411.2164 7721.7664