Brief notes on readings #2

web.stanford.edu/class/stats364/

Jonathan Taylor

Spring 2020

Berk, Brown, Buja, Zhang & Zhao (2013)

Berk, Brown, Buja, Zhang & Zhao (2013)

Submodel parameters

Berk, Brown, Buja, Zhang & Zhao (2013)

Submodel parameters

Berk, Brown, Buja, Zhang & Zhao (2013)

Multiple comparisons problem

Berk, Brown, Buja, Zhang & Zhao (2013)

Multiple comparisons problem

Scheffe / Mahalanobis

Berk, Brown, Buja, Zhang & Zhao (2013)

Berk, Brown, Buja, Zhang & Zhao (2013)

Computation

Berk, Brown, Buja, Zhang & Zhao (2013)

Example: Boston data

library(PoSI)
library(MASS)
data(Boston)
set.seed(1)
Y = as.numeric(Boston[,14])
X = as.matrix(Boston[,-14])
PoSI_Boston = PoSI(X)
## Number of contrasts/adjusted predictors to process: 53248 
## Number of bundles: 13 
##                          Done with bundle 1 / 13    model sz = 1 
##                          Done with bundle 2 / 13    model sz = 2 
##                          Done with bundle 3 / 13    model sz = 3 
##                          Done with bundle 4 / 13    model sz = 4 
##                          Done with bundle 5 / 13    model sz = 5 
##                          Done with bundle 6 / 13    model sz = 6 
##                          Done with bundle 7 / 13    model sz = 7 
##                          Done with bundle 8 / 13    model sz = 8 
##                          Done with bundle 9 / 13    model sz = 9 
##                          Done with bundle 10 / 13    model sz = 10 
##                          Done with bundle 11 / 13    model sz = 11 
##                          Done with bundle 12 / 13    model sz = 12 
##                          Done with bundle 13 / 13    model sz = 13 
## p = 13 , d = 13   processed 53248 tests in 8191 models.  Times in seconds:
##    user  system elapsed 
##   2.556   0.566   3.465
summary(PoSI_Boston)
##     K.PoSI K.Bonferroni K.Scheffe
## 95%  3.597        4.904     4.729
## 99%  4.077        5.211     5.262

Berk, Brown, Buja, Zhang & Zhao (2013)

Example: Boston data

library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-18
cvG = cv.glmnet(X, Y)
G = glmnet(X, Y)
beta = coef(G, s=cvG$lambda.1se)[-1]
selected = which(beta != 0)
selected
## [1]  1  2  4  5  6  8 11 12 13
posi_conf = confint(lm(Y ~ X[,selected]), level=1 - 2 * pnorm(-3.597))
posi_conf
##                          0.0161 %   99.9839 %
## (Intercept)           11.85756472 47.15842931
## X[, selected]crim     -0.17121180  0.04886442
## X[, selected]zn       -0.00658954  0.09065389
## X[, selected]chas     -0.11561086  6.17545861
## X[, selected]nox     -27.79875140 -4.37827369
## X[, selected]rm        2.67285588  5.62647890
## X[, selected]dis      -2.11486443 -0.74846471
## X[, selected]ptratio  -1.26370306 -0.41357672
## X[, selected]black    -0.00144639  0.01802963
## X[, selected]lstat    -0.70015311 -0.34985515

Berk, Brown, Buja, Zhang & Zhao (2013)

Data splitting

random_order = sample(1:506, 506, replace=FALSE)
train = random_order[1:250]
test = random_order[251:506]
cvG_train = cv.glmnet(X[train,], Y[train])
G_train = glmnet(X[train,], Y[train])
beta_train = coef(G_train, s=cvG_train$lambda.1se)[-1]
selected_train = which(beta_train != 0)
selected_train
## [1]  1  4  5  6  8 11 12 13
split_conf = confint(lm(Y ~ X[,selected_train], subset=test))
split_conf
##                                   2.5 %      97.5 %
## (Intercept)                  5.73093716 31.03918720
## X[, selected_train]crim     -0.12134596  0.05437917
## X[, selected_train]chas      0.34990827  5.09444004
## X[, selected_train]nox     -20.25303704 -3.11906246
## X[, selected_train]rm        4.24010553  6.35597613
## X[, selected_train]dis      -1.37067478 -0.46688988
## X[, selected_train]ptratio  -1.21807244 -0.62144636
## X[, selected_train]black     0.00320544  0.01712368
## X[, selected_train]lstat    -0.59742776 -0.33269668

Berk, Brown, Buja, Zhang & Zhao (2013)

Data splitting

posi_comp = confint(lm(Y ~ X[,selected_train]), level=1 - 2 * pnorm(-3.597))
data.frame(split=split_conf[,2]-split_conf[,1], posi=posi_comp[,2]-posi_comp[,1])
##                                  split        posi
## (Intercept)                25.30825004 35.60486348
## X[, selected_train]crim     0.17572512  0.21928657
## X[, selected_train]chas     4.74453177  6.34562364
## X[, selected_train]nox     17.13397458 23.62464331
## X[, selected_train]rm       2.11587060  2.94811328
## X[, selected_train]dis      0.90378490  1.19676252
## X[, selected_train]ptratio  0.59662608  0.81820182
## X[, selected_train]black    0.01391824  0.01964244
## X[, selected_train]lstat    0.26473108  0.35334960

Berk, Brown, Buja, Zhang & Zhao (2013)

Best case

Worst case

Taylor (2018)

split_conf
##                                   2.5 %      97.5 %
## (Intercept)                  5.73093716 31.03918720
## X[, selected_train]crim     -0.12134596  0.05437917
## X[, selected_train]chas      0.34990827  5.09444004
## X[, selected_train]nox     -20.25303704 -3.11906246
## X[, selected_train]rm        4.24010553  6.35597613
## X[, selected_train]dis      -1.37067478 -0.46688988
## X[, selected_train]ptratio  -1.21807244 -0.62144636
## X[, selected_train]black     0.00320544  0.01712368
## X[, selected_train]lstat    -0.59742776 -0.33269668

Taylor (2018)

Taylor (2018)

Taylor (2018)

Taylor (2018)