Selection through a black box

web.stanford.edu/class/stats364/

Jonathan Taylor

Spring 2020

Selection through a black box

General selection mechanisms

Selection through a black box

By now we should know that…

Selection through a black box

What do we really need to do this?

A simple idea

How to draw samples?

Selection through a black box

A simple example: drop the losers

def winning_arm(*samples):
    """Return the index of the arm with the largest observed sample mean."""
    means = [np.mean(arm) for arm in samples]
    return np.argmax(means)
truth = [1,0.9,1.1]  # true arm means; arm 2 is actually best
# One sample of size 20 per arm.
samples = [np.random.standard_normal(20) + truth[0], 
           np.random.standard_normal(20) + truth[1],
           np.random.standard_normal(20) + truth[2]]
# Selection step: keep only the arm with the largest observed mean.
i_star = winning_arm(*samples)
# Followup data is collected only on the selected arm.
followup = np.random.standard_normal(20) + truth[i_star]
i_star
## 0

Selection through a black box

A simple example: drop the losers

np.random.seed(2)  # fix the RNG so the bootstrap learning data below is reproducible
def generate_learning_data(i_star,
                           samples,
                           followup,
                           winning_arm=winning_arm,
                           B=5000):
    """Simulate (Y, T) pairs for learning the selection probability.

    Resamples the winning arm's pooled data (original sample + followup)
    B times; Y records whether arm i_star still wins on the perturbed
    samples, T records the bootstrap pooled mean.
    """
    arms = copy(samples)
    n_star = len(arms[i_star])
    pooled = np.hstack([arms[i_star], followup])
    labels, stats = [], []
    for _ in range(B):
        # Bootstrap the pooled winner data, then carve out a "stage 1"
        # sample of the original size (without replacement).
        pooled_star = np.random.choice(pooled, pooled.shape[0], replace=True)
        arms[i_star] = np.random.choice(pooled_star, n_star, replace=False)
        labels.append(winning_arm(*arms) == i_star)
        stats.append(pooled_star.mean())
    return np.array(labels), np.array(stats).reshape((-1, 1))

# Learning data: Y = "did arm i_star still win", T = bootstrap pooled mean.
Y, T = generate_learning_data(i_star, samples, followup)
Y.shape, T.shape
## ((5000,), (5000, 1))

Fit the probability learner

# Learn P(selection | T) with an off-the-shelf black-box classifier.
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(random_state=0)
clf.fit(T, Y);

Selection through a black box

A simple example: drop the losers

Bootstrap may not capture full range

Exercise

  1. What should this curve look like?

  2. Assume \(i^*=1\). What would happen if we instead learnt selection as a function of \((\bar{D}_1, \bar{D}_{1,1})\) instead of only \(\bar{D}_1\)? Can you use such a function to do the correct adjustment?

import matplotlib.pyplot as plt
# %matplotlib inline
# Estimated selection probability as a function of the pooled mean T.
pi_hat = clf.predict_proba(T)
plt.scatter(T[:,0], pi_hat[:,1])

Selection through a black box

How to vary range of \(\bar{D}_{i}\) for \(i=i^*\)?

def generate_learning_data_frac(i_star, 
                                samples, 
                                followup, 
                                winning_arm=winning_arm,
                                B=5000, 
                                frac=0.6):
    """Like generate_learning_data, but each bootstrap draws only a
    fraction `frac` of the pooled winning-arm data, which widens the
    range of T values the learner gets to see.
    """
    arms = copy(samples)
    n_star = len(arms[i_star])
    pooled = np.hstack([arms[i_star], followup])
    labels, stats = [], []
    for _ in range(B):
        pooled_star = np.random.choice(pooled,
                                       int(frac * pooled.shape[0]),
                                       replace=True)
        # Note: requires int(frac * len(pooled)) >= n_star for replace=False.
        arms[i_star] = np.random.choice(pooled_star, n_star, replace=False)
        labels.append(winning_arm(*arms) == i_star)
        stats.append(pooled_star.mean())
    return np.array(labels), np.array(stats).reshape((-1, 1))
# Refit on data generated at frac=0.6: T now explores a wider range.
Y2, T2 = generate_learning_data_frac(i_star, samples, followup)
clf2 = GradientBoostingClassifier(random_state=0)
clf2.fit(T2, Y2);
pi_hat2 = clf2.predict_proba(T2)
plt.scatter(T2[:,0], pi_hat2[:,1])

# Pool the frac=1 and frac=0.6 learning data and refit.
T3, Y3 = np.vstack([T, T2]), np.hstack([Y, Y2])
clf3 = GradientBoostingClassifier(random_state=0)
clf3.fit(T3, Y3);
pi_hat3 = clf3.predict_proba(T3)
plt.scatter(T3[:,0], pi_hat3[:,1])

# Add a third batch at frac=0.7 and fit on everything pooled so far.
Y4, T4 = generate_learning_data_frac(i_star, samples, followup, frac=0.7, B=5000)
T4, Y4 = np.vstack([T4, T3]), np.hstack([Y4, Y3])
clf4 = GradientBoostingClassifier(random_state=0)
clf4.fit(T4, Y4);
pi_hat4 = clf4.predict_proba(T4)
plt.scatter(T4[:,0], pi_hat4[:,1])

def pvalue(full_data_mean, 
           se, 
           pi_hat, 
           B=10000,
           truth=1):
    """Selection-adjusted p-value for H_a: winning-arm mean > `truth`.

    Monte-Carlo: draw B means under the null, weight each draw by the
    learnt selection probability pi_hat, and return the weighted
    survival probability of the observed mean.
    """
    null_means = np.random.standard_normal(B) * se + truth
    weights = pi_hat(null_means)
    exceeds = null_means > full_data_mean
    return np.sum(weights * exceeds) / np.sum(weights)

# Adjusted p-value for the winning arm, using the pooled classifier clf4.
full_data_winning = np.hstack([samples[i_star], followup])
pvalue(full_data_winning.mean(), 
       full_data_winning.std() / np.sqrt(full_data_winning.shape[0]),
       lambda t: clf4.predict_proba(t.reshape((-1,1)))[:,1])
## 0.6218785858108452
def simulate(sizes=[20, 20, 20],
             size_followup=20,
             fracs=[1,0.6,0.7],
             truth=[0.9, 1, 1.1],
             winning_arm=winning_arm):
    """One full drop-the-losers trial: select a winner, learn the selection
    probability, and return a selection-adjusted p-value evaluated at the
    winner's true mean."""

    # Run the trial: one sample per arm, then followup on the winner.
    samples = [np.random.standard_normal(n) + truth[i]
               for i, n in enumerate(sizes)]
    i_star = winning_arm(*samples)
    followup = np.random.standard_normal(size_followup) + truth[i_star]

    # Learning data pooled across several subsampling fractions.
    pairs = [generate_learning_data_frac(i_star, samples, followup, frac=f)
             for f in fracs]
    Y = np.hstack([y for y, _ in pairs])
    T = np.vstack([t for _, t in pairs])

    # Learn the selection probability.
    clf = GradientBoostingClassifier(random_state=0)
    clf.fit(T, Y)

    # Use it for inference on the winning arm's mean.
    winner_data = np.hstack([samples[i_star], followup])
    pi_hat = lambda t: clf.predict_proba(t.reshape((-1, 1)))[:, 1]
    return pvalue(winner_data.mean(),
                  winner_data.std() / np.sqrt(winner_data.shape[0]),
                  pi_hat,
                  truth=truth[i_star])
simulate()
## 0.2442717377465183
# Repeat the pipeline: the adjusted p-values should look roughly uniform.
P = []
for _ in range(50):
    P.append(simulate())
pivot_plot(P);

Selection through a black box

Can we just bootstrap all the data?

def generate_learning_data_boot(i_star,
                                samples,
                                followup,
                                winning_arm=winning_arm,
                                B=5000,
                                frac=0.7):
    """Bootstrap ALL arms, not just the winner.

    T gains one column per losing arm (its bootstrap mean), so the learnt
    selection probability can condition on every arm's statistic.
    Column 0 is always the winner's pooled bootstrap mean.
    """
    arms = copy(samples)
    n_star = len(arms[i_star])
    pooled = np.hstack([samples[i_star], followup])
    labels, stats = [], []
    for _ in range(B):
        pooled_star = np.random.choice(pooled, int(frac * pooled.shape[0]), replace=True)
        arms[i_star] = np.random.choice(pooled_star, n_star, replace=False)
        # Losing arms are bootstrapped from their own (original) samples.
        for j in range(len(samples)):
            if j != i_star:
                arms[j] = np.random.choice(samples[j], int(frac * samples[j].shape[0]), replace=True)
        labels.append(winning_arm(*arms) == i_star)
        stats.append([pooled_star.mean()] + [arms[j].mean() for j in range(len(samples)) if j != i_star])
    return np.array(labels), np.array(stats)

# T now has one column for the winner plus one per losing arm.
Y, T = generate_learning_data_boot(i_star, samples, followup)
T.shape
## (5000, 3)
def simulate_boot(sizes=[20, 20, 20],
                  size_followup=20,
                  fracs=[1,0.6,0.7],
                  truth=[0.9, 1, 1.1],
                  winning_arm=winning_arm):
    """One drop-the-losers trial using the all-arms bootstrap learner.

    Returns a selection-adjusted p-value for the winning arm's mean,
    evaluated at the true mean truth[i_star].
    """
    
    # run our trial
    
    samples = [np.random.standard_normal(size) + truth[i] for i, size in enumerate(sizes)]
    i_star = winning_arm(*samples)
    followup = np.random.standard_normal(size_followup) + truth[i_star]
    
    # generate our learning data (several fractions widen the range of T)
    
    Y, T = [], []
    for frac in fracs:
        y, t = generate_learning_data_boot(i_star, samples, followup, frac=frac)
        Y.append(y); T.append(t)
    Y = np.hstack(Y)
    T = np.vstack(T)
    
    # learn the selection probability
    
    clf = GradientBoostingClassifier(random_state=0)
    clf.fit(T, Y)
    
    # use selection probability for inference
    
    other_means = [samples[j].mean() for j in range(len(samples)) if j != i_star]
    full_data_winning = np.hstack([samples[i_star], followup])
    
    def pi_hat(t):
        # pvalue varies only the winner's statistic (column 0); the losing
        # arms' means are held fixed at their observed values.
        full_t = np.zeros((t.shape[0], len(samples)))
        full_t[:,0] = t
        full_t[:,1:] = other_means
        return clf.predict_proba(full_t)[:,1]

    return pvalue(full_data_winning.mean(), 
                  full_data_winning.std() / np.sqrt(full_data_winning.shape[0]),
                  pi_hat,
                  truth=truth[i_star])

simulate_boot()
## 0.32429574652109217
# Collect adjusted p-values over 50 replications.
P = []
for _ in range(50):
    P.append(simulate_boot())
    

Selection through a black box

Does it depend heavily on our selection?

# center them around 0
# (in-place shift: after this every arm's true mean is 0)
for sample in samples:
    sample -= 1
    
def winning_arm_abs(*samples):
    """Return the index of the arm whose sample mean is largest in absolute value."""
    abs_means = [np.fabs(np.mean(arm)) for arm in samples]
    return np.argmax(abs_means)
# Selection by largest |mean|: regenerate learning data for this mechanism.
i_star_abs = winning_arm_abs(*samples)
Y4, T4 = generate_learning_data_frac(i_star_abs, samples, followup, frac=0.6, B=25000,
                                     winning_arm=winning_arm_abs)
clf4 = GradientBoostingClassifier(random_state=0)
clf4.fit(T4, Y4);
pi_hat4 = clf4.predict_proba(T4)
plt.scatter(T4[:,0], pi_hat4[:,1])
## <matplotlib.collections.PathCollection object at 0x1c2efbb8d0>
# Same pipeline, but with selection by absolute value of the mean.
P = []
for _ in range(50):
    P.append(simulate(winning_arm=winning_arm_abs))
pivot_plot(P);

Selection through a black box

A three-stage trial

stage1_samples = [np.random.standard_normal(20) + 1 for _ in range(5)]  # five arms, all true mean 1
def stage1(*samples):
    """Return the indices of the three arms with the largest sample means
    (ordered by increasing mean)."""
    order = np.argsort([np.mean(arm) for arm in samples])
    return order[-3:]
stage1_winners = stage1(*stage1_samples)
stage1_winners
## array([2, 0, 4])
# Stage 2: 20 new observations on each stage-1 winner, pooled with its stage-1 data.
stage2_samples = [np.hstack([np.random.standard_normal(20) + 1, 
                             stage1_samples[i]]) for i in stage1_winners]
def stage2(*samples):
    """Return the index of the arm with the largest sample mean among the
    stage-2 survivors."""
    means = [np.mean(arm) for arm in samples]
    return np.argmax(means)
# BUG FIX: stage 2 must be decided on the stage-2 samples; previously this
# read the module-level `samples` left over from the drop-the-losers example.
stage2_winner = stage2(*stage2_samples)
overall_winner = stage1_winners[stage2_winner]
# Stage 3: 20 further observations on the overall winner, pooled with all
# data already collected on that arm.
stage3_sample = np.hstack([stage2_samples[stage2_winner],
                           np.random.standard_normal(20) + 1])
overall_winner
## 2

Generating learning data

def generate_learning_multi(stage1_samples,
                            stage1_winners,
                            stage2_samples,
                            stage2_winner,
                            overall_winner,
                            stage3_sample,
                            frac=0.7,
                            B=5000):
    """Learning data for the three-stage trial.

    Each bootstrap subsamples the overall winner's pooled three-stage data,
    threads it back into the earlier stages, and replays both selections.
    Y: did the same arm win overall; T: bootstrap pooled mean.
    """
    stage2_samples_cp = copy(stage2_samples)
    stage1_samples_cp = copy(stage1_samples)
    
    Y, T = [], []
    
    for _ in range(B):
        # Subsample a fraction of the winner's full three-stage data.
        overall_winner_star = np.random.choice(stage3_sample, int(frac * stage3_sample.shape[0]),
                                               replace=True)
        # Nested without-replacement draws mimic the nesting: stage-1 data is
        # a subset of stage-2 data, which is a subset of stage-3 data.
        stage2_samples_cp[stage2_winner] = np.random.choice(overall_winner_star,
                                                            stage2_samples[stage2_winner].shape[0],
                                                            replace=False)
        stage1_samples_cp[overall_winner] = np.random.choice(stage2_samples_cp[stage2_winner],
                                                             stage1_samples_cp[overall_winner].shape[0],
                                                             replace=False)
        
        # Replay stage 1 on the perturbed arms.
        stage1_winners_star = stage1(*stage1_samples_cp)
        if overall_winner in stage1_winners_star:
            # NOTE(review): stage2_samples_cp is indexed by the ORIGINAL
            # stage-1 winners while stage1_winners_star may list different
            # arms in a different order; the composition below assumes the
            # two line up -- confirm this approximation is intended.
            stage2_winner_star = stage2(*stage2_samples_cp)
            overall_winner_ = stage1_winners_star[stage2_winner_star]
            y = overall_winner_ == overall_winner
        else:
            y = 0
        Y.append(y)
        T.append(overall_winner_star.mean())
    return np.array(Y), np.array(T).reshape((-1,1))
      
# Learning data for the three-stage trial, then fit the selection learner.
Y, T = generate_learning_multi(stage1_samples,
                               stage1_winners,
                               stage2_samples,
                               stage2_winner,
                               overall_winner,
                               stage3_sample)
clf = GradientBoostingClassifier(random_state=0)
clf.fit(T, Y);
# BUG FIX: predict with the classifier just fit on (T, Y) -- `clf4` belongs
# to the earlier absolute-value example and was fit on different data.
pi_hat = clf.predict_proba(T)
plt.scatter(T[:,0], pi_hat[:,1])
## <matplotlib.collections.PathCollection object at 0x1c2db09e50>
def simulate_multi(stage1_size=[20]*10,
                   stage2_size=[20]*3,
                   stage3_size=20,
                   fracs=[1,0.7,0.8],
                   truth=[1]*10):
    """Run one three-stage trial end to end.

    Returns a selection-adjusted p-value for the overall winner's mean,
    evaluated at its true value truth[overall_winner].
    """
    
    # run our trial
    
    stage1_samples = [np.random.standard_normal(size) + truth[i] for i, size in enumerate(stage1_size)]
    stage1_winners = stage1(*stage1_samples)
    
    stage2_samples = [np.hstack([np.random.standard_normal(size) + truth[i], 
                      stage1_samples[i]]) for i, size in zip(stage1_winners, stage2_size)]
    # BUG FIX: stage 2 must be decided on stage2_samples; previously this
    # read the module-level `samples` left over from an earlier example,
    # which corrupts the selection (and the resulting p-values).
    stage2_winner = stage2(*stage2_samples)
    overall_winner = stage1_winners[stage2_winner]
    stage3_sample = np.hstack([stage2_samples[stage2_winner],
                               np.random.standard_normal(stage3_size) + truth[overall_winner]])
    
    # generate our learning data (pooled over several fractions)
    
    Y, T = [], []
    for frac in fracs:
        y, t = generate_learning_multi(stage1_samples,
                                       stage1_winners,
                                       stage2_samples,
                                       stage2_winner,
                                       overall_winner,
                                       stage3_sample,
                                       frac=frac)
        Y.append(y); T.append(t)
    Y = np.hstack(Y)
    T = np.vstack(T)
    
    # learn the selection probability
    
    clf = GradientBoostingClassifier(random_state=0)
    clf.fit(T, Y)
    
    # use selection probability for inference
    
    pi_hat = lambda t: clf.predict_proba(t.reshape((-1,1)))[:,1]
    return pvalue(stage3_sample.mean(), 
                  stage3_sample.std() / np.sqrt(stage3_sample.shape[0]),
                  pi_hat,
                  truth=truth[overall_winner])

simulate_multi()
## 0.9980666296040841
# Sampling distribution of the adjusted p-values over 50 replications.
P = []
for _ in range(50):
    P.append(simulate_multi())
pivot_plot(P);

Selection through a black box

Is this a black box?

Selection through a black box

Regression

Selection through a black box

Selection through a black box

Selection through a black box

Covariance / precision

Exercise

  1. For \(n=100, p=20\) and \(\Theta=I\), try the above algorithm to construct hypothesis tests for edges selected by the graphical LASSO at a fixed value of the regularization parameter.

  2. Describe how you can use a learnt \(\hat{\pi}\) function to construct confidence intervals for a selected element of \(\Theta\). Try this out on the graphical LASSO above.