This notebook demonstrates the knockoff construction for the asymmetric Markov chain experiments. The data is generated as follows. Let \[ Z_j\stackrel{i.i.d.}{\sim} \frac{I\cdot \mid Y_\text{G} \mid -(1-I)\cdot Y_\text{E}- \mu}{\sigma} \ \text{ for } j=1,\dots,p, \] where \(I\sim\text{Bernoulli}(1/2)\), \(Y_\text{G}\sim\mathcal{N}(0,1)\), and \(Y_\text{E}\sim\text{Exp}(1)\) are independent, and \(\mu\) and \(\sigma\) are chosen so that the \(Z_j\) are centered with unit variance. Then define \[ X_1=Z_1, \quad X_{j+1}=\rho_j X_j + \sqrt{1-\rho_j^2}Z_{j+1} \ \text{ for } j=1,\dots,p-1. \] Section 5.2.2 of the accompanying paper presents simulation results in this setting.
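For reference, the standardizing constants follow from the half-normal and exponential moments, and they match the asymean and variance constants defined in the code below: \[ \mu = \tfrac{1}{2}\,\mathbb{E}\mid Y_\text{G}\mid - \tfrac{1}{2}\,\mathbb{E}\, Y_\text{E} = \tfrac{1}{\sqrt{2\pi}} - \tfrac{1}{2}, \qquad \sigma^2 = \tfrac{1}{2}\,\mathbb{E}\, Y_\text{G}^2 + \tfrac{1}{2}\,\mathbb{E}\, Y_\text{E}^2 - \mu^2 = \tfrac{3}{2} - \mu^2. \]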

Multiple-try Metropolis

We first demonstrate the multiple-try Metropolis (MTM) technique (Section 3.3 of the paper) with the recommended settings.

# simulation parameters
p = 30  # dimension of the random vector X
n_samples = 300  # number of repetitions
rhos = rep(0.6, p - 1)  # correlations in the generative distribution

# algorithm parameters
halfnumtry = 4  # m: number of proposal candidates on each side of the current state
stepsize = 1.5  # step size in units of 1/sqrt((Sigma^{-1})_{jj})

# constants used to standardize the Z_j's
asymean = 1/sqrt(2 * pi) - 1/2  # mean of I*|Y_G| - (1-I)*Y_E
variance = 1.5 - (1/sqrt(2 * pi) - 1/2)^2  # its variance
# configure parallelism
library(foreach)
library(doMC)
n_cores = 3
registerDoMC(cores = n_cores)
# additional packages
library(latex2exp)
library(ggplot2)

source("../asymmetric/asym_MTM_core.R")  #load the knockoff functions

First, we compute the step size of the proposal grid for each variable. The recommended step size for variable \(j\) (Section 3.3 of the paper) is \[ 1.5 / \sqrt{(\Sigma^{-1})_{jj}}. \]
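For this chain, the diagonal of \(\Sigma^{-1}\) has a closed form (it is the standard tridiagonal precision of an AR(1)-type covariance), which is what the next code block evaluates: \[ \frac{1}{\sqrt{(\Sigma^{-1})_{jj}}} = \sqrt{\frac{(1-\rho_{j-1}^2)(1-\rho_j^2)}{1-\rho_{j-1}^2\rho_j^2}} \ \text{ for } 1<j<p, \] with \(1/\sqrt{(\Sigma^{-1})_{11}} = \sqrt{1-\rho_1^2}\) and \(1/\sqrt{(\Sigma^{-1})_{pp}} = \sqrt{1-\rho_{p-1}^2}\).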

# compute the proposal grid for each variable using an exact analytic calculation
quantile.x = matrix(0, ncol = 2 * halfnumtry + 1, nrow = p)  # row j holds the 2m+1 grid offsets for variable j
sds = rep(0, p)  # sds[j] = 1/sqrt((Sigma^{-1})_{jj})
sds[1] = sqrt(1 - rhos[1]^2)
for (i in 2:(p - 1)) {
    sds[i] = sqrt((1 - rhos[i - 1]^2) * (1 - rhos[i]^2)/(1 - rhos[i - 1]^2 * 
        rhos[i]^2))
}
sds[p] = sqrt(1 - rhos[p - 1]^2)
for (i in 1:p) {
    quantile.x[i, ] = ((-halfnumtry):halfnumtry) * sds[i] * stepsize  # equally spaced offsets centered at zero
}

Then, we sample the Markov chain and the knockoffs with the SCEP.MH.MC function.

# sample the variable and generate the knockoffs
temp = matrix(0, ncol = 2 * p, nrow = 1)
start = Sys.time()
result_mtm = foreach(i = 1:n_samples, .combine = "rbind") %dopar% {
    # sample the MC
    if (runif(1) >= 0.5) {
        temp[1] = (abs(rnorm(1)) - asymean)/sqrt(variance)
    } else {
        temp[1] = (-rexp(1) - asymean)/sqrt(variance)
    }
    for (j in 2:p) {
        if (runif(1) >= 0.5) {
            temp[j] = ((abs(rnorm(1)) - asymean)/sqrt(variance)) * sqrt(1 - 
                rhos[j - 1]^2) + rhos[j - 1] * temp[j - 1]
        } else {
            temp[j] = ((-rexp(1) - asymean)/sqrt(variance)) * sqrt(1 - rhos[j - 
                1]^2) + rhos[j - 1] * temp[j - 1]
        }
    }
    
    # generate the knockoffs
    temp[(p + 1):(2 * p)] = SCEP.MH.MC(temp[1:p], 0.999, quantile.x)
    temp
}
end = Sys.time()
print(paste0("Average time per observation + knockoff (seconds): ", round((end - 
    start) * n_cores/n_samples, digits = 3)))
[1] "Average time per observation + knockoff (seconds): 0.012"
dim(result_mtm)
[1] 300  60

Knockoff quality

We can compute the mean correlation between \(X_j\) and \(\tilde{X}_j\) to measure the knockoff quality:

X = result_mtm[, 1:p]
Xk = result_mtm[, (p + 1):(2 * p)]

cors = c()
for (j in 1:p) {
    cors = c(cors, cor(X[, j], Xk[, j]))
}
mean(cors)
[1] 0.5857757

We can also plot the correlation between \(X_j\) and \(\tilde{X}_j\) across different coordinates:

# plot the correlation versus the variable index
qplot(1:p, cors) + theme_bw() + xlab("variable index") + ylab(TeX("cor($X_j$, $\\tilde{X}_j$)")) + 
    scale_y_continuous(limits = c(0, 1))

We see that the first and last knockoffs have higher quality, since these variables each depend on only one other variable.

Diagnostics: checking means and covariances

As a basic diagnostic, we check that each coordinate is approximately mean zero and that the empirical covariance matrices of \(X\) and \(\tilde{X}\) are close. As the number of samples increases, the empirical covariances of \(X\) and \(\tilde{X}\) converge to the same population covariance.

max(abs(colMeans(X)))  #largest absolute coordinate mean of X
[1] 0.1029638
max(abs(colMeans(Xk)))  #largest absolute coordinate mean of Xk
[1] 0.1294849
S = cor(X)  #correlation matrix of X
Sk = cor(Xk)  #correlation matrix of Xk
max(abs(S - Sk))  #largest entrywise difference between the correlation matrices
[1] 0.1865868

We can visualize the difference in the empirical correlation matrices of \(X\) and \(\tilde{X}\) with a heatmap.

# visualize the difference in the empirical correlations
library(ggcorrplot)
ggcorrplot(cor(X) - cor(Xk), ggtheme = ggplot2::theme_bw, legend.title = "Corr diff", 
    title = "Difference in correlation structure") + scale_y_reverse()

We see that the difference between the empirical correlation matrices of \(X\) and \(\tilde{X}\) is small, as expected, since \(X\) and \(\tilde{X}\) come from the same distribution.

Covariance-guided proposals

We can also generate knockoffs with the covariance-guided proposals (Section 3.2 of the paper) using the SCEP.MH.MC.COV function. These proposals are motivated by a Gaussian approximation (indeed, they are optimal for Gaussian distributions), but they produce exact knockoffs because of the Metropolis–Hastings correction.
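The call below assumes that the proposal parameters prop.mat, cond.means.coeff, and cond.vars have been precomputed from the covariance matrix \(\Sigma\) of \(X\); that preparation step is not shown in this section. For this chain \(\Sigma\) has a simple closed form, since the \(Z_j\) are standardized: \(\mathrm{Cov}(X_i, X_j) = \prod_{k=i}^{j-1}\rho_k\). A minimal sketch of the covariance computation it starts from (the name Sigma is ours; the derivation of the three proposal parameters from it follows Section 3.2 of the paper and is not reproduced here):

# covariance matrix of the chain (unit variances by construction)
Sigma = diag(p)
for (i in 1:(p - 1)) {
    for (j in (i + 1):p) {
        Sigma[i, j] = prod(rhos[i:(j - 1)])  # Cov(X_i, X_j) = rho_i * ... * rho_{j-1}
        Sigma[j, i] = Sigma[i, j]
    }
}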

n_samples = 1000
set.seed(100)

temp = matrix(0, ncol = 2 * p, nrow = 1)
start = Sys.time()
result_cov = foreach(i = 1:n_samples, .combine = "rbind", .packages = "MASS") %do% 
    {
        # sample the MC
        if (runif(1) >= 0.5) {
            temp[1] = (abs(rnorm(1)) - asymean)/sqrt(variance)
        } else {
            temp[1] = (-rexp(1) - asymean)/sqrt(variance)
        }
        for (j in 2:p) {
            if (runif(1) >= 0.5) {
                temp[j] = ((abs(rnorm(1)) - asymean)/sqrt(variance)) * sqrt(1 - 
                  rhos[j - 1]^2) + rhos[j - 1] * temp[j - 1]
            } else {
                temp[j] = ((-rexp(1) - asymean)/sqrt(variance)) * sqrt(1 - rhos[j - 
                  1]^2) + rhos[j - 1] * temp[j - 1]
            }
        }
        
        # generate the knockoffs
        xk = SCEP.MH.MC.COV(temp[1:p], 1, rep(0, p), prop.mat, cond.means.coeff, 
            cond.vars)
        temp[(p + 1):(2 * p)] = xk[1:p]
        temp
    }
end = Sys.time()
print(paste0("Average time per observation + knockoff (seconds): ", round((end - 
    start)/n_samples, digits = 3)))
[1] "Average time per observation + knockoff (seconds): 0.005"
dim(result_cov)
[1] 1000   60

Knockoff quality

We can compute the mean correlation between \(X_j\) and \(\tilde{X}_j\) to measure the knockoff quality:

X = result_cov[, 1:p]
Xk = result_cov[, (p + 1):(2 * p)]

cors = c()
for (j in 1:p) {
    cors = c(cors, cor(X[, j], Xk[, j]))
}
mean(cors)

We can also plot the correlation between \(X_j\) and \(\tilde{X}_j\) across different coordinates:

# plot the correlation versus the variable index
qplot(1:p, cors) + theme_bw() + xlab("variable index") + ylab(TeX("cor($X_j$, $\\tilde{X}_j$)")) + 
    scale_y_continuous(limits = c(0, 1))

We see that the first and last knockoffs have higher quality, since these variables each depend on only one other variable.

Diagnostics: checking means and covariances

As a basic diagnostic, we check that each coordinate is approximately mean zero and that the empirical covariance matrices of \(X\) and \(\tilde{X}\) are close. As the number of samples increases, the empirical covariances of \(X\) and \(\tilde{X}\) converge to the same population covariance.

max(abs(colMeans(X)))  #largest absolute coordinate mean of X
[1] 0.07067053
max(abs(colMeans(Xk)))  #largest absolute coordinate mean of Xk
[1] 0.08482318
S = cor(X)  #correlation matrix of X
Sk = cor(Xk)  #correlation matrix of Xk
max(abs(S - Sk))  #largest entrywise difference between the correlation matrices
[1] 0.1208887

We can visualize the difference in the empirical correlation matrices of \(X\) and \(\tilde{X}\) with a heatmap.

# visualize the difference in the empirical correlations
library(ggcorrplot)
ggcorrplot(cor(X) - cor(Xk), ggtheme = ggplot2::theme_bw, legend.title = "Corr diff", 
    title = "Difference in correlation structure") + scale_y_reverse()

We see that the difference between the empirical correlation matrices of \(X\) and \(\tilde{X}\) is small, as expected, since \(X\) and \(\tilde{X}\) come from the same distribution.