Lab: Logistic Regression, LDA, QDA, and KNN
Contents
Lab: Logistic Regression, LDA, QDA, and KNN
The Stock Market Data
# Load ISLR; the Smarket data frame used below is assumed to come from it.
library(ISLR)
# Column names of the data set.
names(Smarket)
# Number of rows and columns.
dim(Smarket)
# Per-column summary statistics (level counts for the factor Direction).
summary(Smarket)
# Scatterplot matrix of every pair of columns.
pairs(Smarket)
- 'Year'
- 'Lag1'
- 'Lag2'
- 'Lag3'
- 'Lag4'
- 'Lag5'
- 'Volume'
- 'Today'
- 'Direction'
- 1250
- 9
Year Lag1 Lag2 Lag3
Min. :2001 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000
1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500 1st Qu.:-0.640000
Median :2003 Median : 0.039000 Median : 0.039000 Median : 0.038500
Mean :2003 Mean : 0.003834 Mean : 0.003919 Mean : 0.001716
3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750
Max. :2005 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000
Lag4 Lag5 Volume Today
Min. :-4.922000 Min. :-4.92200 Min. :0.3561 Min. :-4.922000
1st Qu.:-0.640000 1st Qu.:-0.64000 1st Qu.:1.2574 1st Qu.:-0.639500
Median : 0.038500 Median : 0.03850 Median :1.4229 Median : 0.038500
Mean : 0.001636 Mean : 0.00561 Mean :1.4783 Mean : 0.003138
3rd Qu.: 0.596750 3rd Qu.: 0.59700 3rd Qu.:1.6417 3rd Qu.: 0.596750
Max. : 5.733000 Max. : 5.73300 Max. :3.1525 Max. : 5.733000
Direction
Down:602
Up :648
# cor() accepts only numeric input and Direction (column 9) is qualitative,
# so this call fails with "'x' must be numeric". It is kept as a demonstration,
# wrapped in try() so the error is reported without aborting the cell.
try(cor(Smarket))
# Pairwise correlations among the numeric columns only (Direction dropped).
cor(Smarket[, -9])
Error in cor(Smarket): 'x' must be numeric
Traceback:
1. cor(Smarket)
2. stop("'x' must be numeric")
# Put Smarket's columns on the search path. Later cells rely on this to refer
# to Volume, Year, Direction, Lag1 and Lag2 by bare name.
attach(Smarket)
# Volume plotted against observation index.
plot(Volume)
Logistic Regression
# Logistic regression of Direction on the five lagged returns and Volume,
# fitted on all of Smarket.
glm.fits <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = Smarket,
  family = binomial
)
summary(glm.fits)
Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
Volume, family = binomial, data = Smarket)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.446 -1.203 1.065 1.145 1.326
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.126000 0.240736 -0.523 0.601
Lag1 -0.073074 0.050167 -1.457 0.145
Lag2 -0.042301 0.050086 -0.845 0.398
Lag3 0.011085 0.049939 0.222 0.824
Lag4 0.009359 0.049974 0.187 0.851
Lag5 0.010313 0.049511 0.208 0.835
Volume 0.135441 0.158360 0.855 0.392
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1731.2 on 1249 degrees of freedom
Residual deviance: 1727.6 on 1243 degrees of freedom
AIC: 1741.6
Number of Fisher Scoring iterations: 3
# Named vector of coefficient estimates.
coef(glm.fits)
# Full coefficient table (estimate, std. error, z value, p-value).
# The element is named in full instead of relying on `$` partial matching
# of the abbreviation `coef`.
summary(glm.fits)$coefficients
- (Intercept)
- -0.12600025655927
- Lag1
- -0.0730737458900263
- Lag2
- -0.0423013440073083
- Lag3
- 0.0110851083796763
- Lag4
- 0.0093589383702788
- Lag5
- 0.0103130684758178
- Volume
- 0.135440658859162
| | Estimate | Std. Error | z value | Pr(>\|z\|) |
|---|---|---|---|---|
| (Intercept) | -0.126000257 | 0.24073574 | -0.5233966 | 0.6006983 |
| Lag1 | -0.073073746 | 0.05016739 | -1.4565986 | 0.1452272 |
| Lag2 | -0.042301344 | 0.05008605 | -0.8445733 | 0.3983491 |
| Lag3 | 0.011085108 | 0.04993854 | 0.2219750 | 0.8243333 |
| Lag4 | 0.009358938 | 0.04997413 | 0.1872757 | 0.8514445 |
| Lag5 | 0.010313068 | 0.04951146 | 0.2082966 | 0.8349974 |
| Volume | 0.135440659 | 0.15835970 | 0.8552723 | 0.3924004 |
# With no new data supplied, predict() returns fitted probabilities for the
# observations used to fit the model.
glm.probs <- predict(glm.fits, type = "response")
head(glm.probs, 10)
# Dummy coding of Direction: shows which level is coded as 1.
contrasts(Direction)
- 1
- 0.507084133395401
- 2
- 0.481467878454591
- 3
- 0.481138835214201
- 4
- 0.515222355813022
- 5
- 0.510781162691538
- 6
- 0.506956460534911
- 7
- 0.492650874187038
- 8
- 0.509229158207377
- 9
- 0.517613526170958
- 10
- 0.488837779771376
| | Up |
|---|---|
| Down | 0 |
| Up | 1 |
# Turn fitted probabilities into class labels: default every observation to
# "Down", then flip to "Up" where the predicted probability exceeds 0.5.
# The vector is sized from glm.probs instead of a hard-coded 1250, so it stays
# correct if the data set changes.
glm.pred <- rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Up"
# Confusion matrix: predictions in rows, true direction in columns.
table(glm.pred, Direction)
# Training accuracy by hand: diagonal counts over the number of observations.
(507 + 145) / 1250
# The same quantity computed directly.
mean(glm.pred == Direction)
Direction
glm.pred Down Up
Down 145 141
Up 457 507
0.5216
0.5216
# Logical mask: TRUE for observations before 2005 (the training period).
train <- Year < 2005
# Held-out rows: everything not in the training period.
Smarket.2005 <- Smarket[!train, ]
dim(Smarket.2005)
# True labels for the held-out rows.
Direction.2005 <- Direction[!train]
- 252
- 9
# Refit the six-predictor logistic regression on the training rows only.
glm.fits <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = Smarket,
  family = binomial,
  subset = train
)
# Predicted probabilities for the held-out 2005 rows.
glm.probs <- predict(glm.fits, Smarket.2005, type = "response")
# Size the label vector from the predictions instead of a hard-coded 252.
glm.pred <- rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Up"
table(glm.pred, Direction.2005)
# Test accuracy, then test error rate.
mean(glm.pred == Direction.2005)
mean(glm.pred != Direction.2005)
Direction.2005
glm.pred Down Up
Down 77 97
Up 34 44
0.48015873015873
0.51984126984127
# Smaller model: only Lag1 and Lag2, fitted on the training rows.
glm.fits <- glm(
  Direction ~ Lag1 + Lag2,
  data = Smarket,
  family = binomial,
  subset = train
)
glm.probs <- predict(glm.fits, Smarket.2005, type = "response")
# Size the label vector from the predictions instead of a hard-coded 252.
glm.pred <- rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Up"
table(glm.pred, Direction.2005)
mean(glm.pred == Direction.2005)
# Accuracy on days predicted "Up": correct "Up" calls over all "Up" calls.
106 / (106 + 76)
Direction.2005
glm.pred Down Up
Down 35 35
Up 76 106
0.55952380952381
0.582417582417582
# Predicted probabilities for two hand-picked (Lag1, Lag2) pairs.
predict(
  glm.fits,
  newdata = data.frame(Lag1 = c(1.2, 1.5), Lag2 = c(1.1, -0.8)),
  type = "response"
)
- 1
- 0.479146239171912
- 2
- 0.496093872956532
Linear Discriminant Analysis
library(MASS)
# LDA on Lag1 and Lag2, fitted on the training rows only.
lda.fit <- lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
lda.fit
plot(lda.fit)
Call:
lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
Prior probabilities of groups:
Down Up
0.491984 0.508016
Group means:
Lag1 Lag2
Down 0.04279022 0.03389409
Up -0.03954635 -0.03132544
Coefficients of linear discriminants:
LD1
Lag1 -0.6420190
Lag2 -0.5135293
# Predictions for the held-out 2005 rows; the result is a list.
lda.pred <- predict(lda.fit, Smarket.2005)
names(lda.pred)
- 'class'
- 'posterior'
- 'x'
# Predicted class labels, compared against the true 2005 directions.
lda.class <- lda.pred$class
table(lda.class, Direction.2005)
mean(lda.class == Direction.2005)
Direction.2005
lda.class Down Up
Down 35 35
Up 76 106
0.55952380952381
# Count observations on each side of the 50% threshold applied to the first
# posterior column.
sum(lda.pred$posterior[, 1] >= 0.5)
sum(lda.pred$posterior[, 1] < 0.5)
70
182
# First 20 posterior probabilities (column 1) next to the predicted classes.
lda.pred$posterior[1:20, 1]
lda.class[1:20]
- 999
- 0.490179249818258
- 1000
- 0.479218499099683
- 1001
- 0.466818479852065
- 1002
- 0.474001069455248
- 1003
- 0.492787663967445
- 1004
- 0.493856154997504
- 1005
- 0.495101564646223
- 1006
- 0.487286099421815
- 1007
- 0.490701348960405
- 1008
- 0.484402624071869
- 1009
- 0.490696276120968
- 1010
- 0.511998846261919
- 1011
- 0.489515226936648
- 1012
- 0.470676122211879
- 1013
- 0.474459285611829
- 1014
- 0.479958339148108
- 1015
- 0.493577529465861
- 1016
- 0.503089377118306
- 1017
- 0.497880612141404
- 1018
- 0.488633086516518
- Up
- Up
- Up
- Up
- Up
- Up
- Up
- Up
- Up
- Up
- Up
- Down
- Up
- Up
- Up
- Up
- Up
- Down
- Up
- Up
Levels:
- 'Down'
- 'Up'
# Number of held-out days whose first-column posterior exceeds 90%.
sum(lda.pred$posterior[, 1] > 0.9)
0
Quadratic Discriminant Analysis
# QDA on Lag1 and Lag2, fitted on the training rows only.
qda.fit <- qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
qda.fit
Call:
qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
Prior probabilities of groups:
Down Up
0.491984 0.508016
Group means:
Lag1 Lag2
Down 0.04279022 0.03389409
Up -0.03954635 -0.03132544
# Predicted classes for the held-out 2005 rows and their accuracy.
qda.class <- predict(qda.fit, Smarket.2005)$class
table(qda.class, Direction.2005)
mean(qda.class == Direction.2005)
Direction.2005
qda.class Down Up
Down 30 20
Up 81 121
0.599206349206349
K-Nearest Neighbors
library(class)
# Predictor matrices for the training and held-out rows.
train.X <- cbind(Lag1, Lag2)[train, ]
test.X <- cbind(Lag1, Lag2)[!train, ]
# Training labels.
train.Direction <- Direction[train]
# Fix the RNG state before calling knn() so the result is reproducible.
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.2005)
# Accuracy by hand: diagonal counts over the 252 held-out days.
(83 + 43) / 252
Direction.2005
knn.pred Down Up
Down 43 58
Up 68 83
0.5
# Same classifier with three neighbours instead of one.
knn.pred <- knn(train.X, test.X, train.Direction, k = 3)
table(knn.pred, Direction.2005)
mean(knn.pred == Direction.2005)
Direction.2005
knn.pred Down Up
Down 48 54
Up 63 87
0.535714285714286
An Application to Caravan Insurance Data
# Number of rows and columns in the Caravan data.
dim(Caravan)
# Put Caravan's columns on the search path; later cells refer to Purchase
# by bare name.
attach(Caravan)
# Counts of each level of the response.
summary(Purchase)
# Share of "Yes" responses: 348 out of 5822 rows.
348/5822
- 5822
- 86
- No
- 5474
- Yes
- 348
0.0597732737890759
# Standardize every column except column 86.
standardized.X <- scale(Caravan[, -86])
# Variances of the first two columns before standardization ...
var(Caravan[, 1])
var(Caravan[, 2])
# ... and after.
var(standardized.X[, 1])
var(standardized.X[, 2])
165.037847395189
0.164707781931954
1
1
# The first 1000 rows form the test set; the rest are used for training.
test <- 1:1000
train.X <- standardized.X[-test, ]
test.X <- standardized.X[test, ]
train.Y <- Purchase[-test]
test.Y <- Purchase[test]
# Fix the RNG state before calling knn() so the result is reproducible.
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Y, k = 1)
# KNN test error rate.
mean(test.Y != knn.pred)
# Share of test rows whose label is not "No", for comparison.
mean(test.Y != "No")
0.118
0.059
# Confusion matrix: predictions in rows, true labels in columns.
table(knn.pred, test.Y)
# Fraction of predicted "Yes" that are truly "Yes".
9 / (68 + 9)
test.Y
knn.pred No Yes
No 873 50
Yes 68 9
0.116883116883117
# Three neighbours.
knn.pred <- knn(train.X, test.X, train.Y, k = 3)
table(knn.pred, test.Y)
# Fraction of predicted "Yes" that are truly "Yes".
5 / 26
# Five neighbours.
knn.pred <- knn(train.X, test.X, train.Y, k = 5)
table(knn.pred, test.Y)
4 / 15
test.Y
knn.pred No Yes
No 920 54
Yes 21 5
0.192307692307692
test.Y
knn.pred No Yes
No 930 55
Yes 11 4
0.266666666666667
# Logistic regression of Purchase on all other columns, fitted on the
# non-test rows.
glm.fits <- glm(
  Purchase ~ .,
  data = Caravan,
  family = binomial,
  subset = -test
)
# Predicted probabilities for the test rows.
glm.probs <- predict(glm.fits, Caravan[test, ], type = "response")
# Classify with a 0.5 cut-off. The label vector is sized from the predictions
# instead of a hard-coded 1000.
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Yes"
table(glm.pred, test.Y)
# Repeat with a lower 0.25 cut-off.
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.25] <- "Yes"
table(glm.pred, test.Y)
# Fraction of predicted "Yes" that are truly "Yes" at the 0.25 cut-off.
11 / (22 + 11)
Warning message:
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
test.Y
glm.pred No Yes
No 934 59
Yes 7 0
test.Y
glm.pred No Yes
No 919 48
Yes 22 11
0.333333333333333