# sgd.jl
#
# Basic files for manipulating datasets, showing the images of the
# digits, and performing the stochastic subgradient method.
module SGD
using PyPlot;
# (Atest, btest, Atrain, btrain) = LoadDatasets(train_filename = "zip.train",
#                                               test_filename = "zip.test")
#
# Reads the training and test sets from the given whitespace-delimited
# files and returns them as (Atest, btest, Atrain, btrain).
#
# Each file stores one example per row; the first column is the integer
# digit label (0, 1, ..., 9) and the remaining columns are the
# features.  Thus Atrain is N-by-d (N examples, d features) and btrain
# is a length-N integer vector, and similarly for the test pair.
function LoadDatasets(train_filename = "zip.train",
                      test_filename = "zip.test")
    # Training file: column 1 is the label, the rest are features.
    raw = readdlm(train_filename, ' ', Float64, '\n');
    btrain = convert(Vector{Int}, raw[:, 1]);
    Atrain = raw[:, 2:end];
    # The test file has the same layout.
    raw = readdlm(test_filename, ' ', Float64, '\n');
    btest = convert(Vector{Int}, raw[:, 1]);
    Atest = raw[:, 2:end];
    return (Atest, btest, Atrain, btrain);
end
# DisplayDigit(digit::Array{Float64})
#
# Given a 256-element representation of a digit (any array shape with
# 256 entries total), displays it as a 16-by-16 grayscale image in a
# new figure.  Raises an assertion error if the input does not have
# exactly 256 entries.
function DisplayDigit(digit::Array{Float64})
    figure();
    pixels = vec(digit);
    @assert length(pixels) == 256
    # Transpose after reshaping so the digit appears in its intended
    # orientation (reshape fills column-major).
    imshow(reshape(pixels, 16, 16)', cmap="gray");
end
# UnitTestSVMSubgradient()
#
# Performs a simple test to see if your implementation of
# MulticlassSVMSubgradient() returns the correct answer.
#
# The fixture is a 5-by-10 classifier matrix X (10 classes in
# dimension 5), a feature vector a, and label b = 5, so the correct
# class index is y = b + 1 = 6.  Gtrue is the precomputed subgradient:
# column 3 holds a and column 6 holds -a (all other columns are zero),
# consistent with the hinge being active at class 3 for this fixture.
# The comparison is exact (distance must be 0), which is reasonable
# here because the expected nonzero columns are +/- a itself, with no
# rounding involved.
function UnitTestSVMSubgradient()
# Fixed 5-by-10 classifier matrix.
X = [ 0.055775 -0.440695 -0.304648 -0.0393231 0.811698 -0.15932 -1.32431 0.844026 0.176501 0.48658 ;
-0.16965 1.11826 -1.22383 -0.563181 0.0350225 -0.415462 0.0743669 2.39206 1.43435 -0.470969 ;
1.13797 2.36466 -0.162278 1.21961 -0.578313 -0.549383 -0.59793 -3.51949 1.68211 -1.91986 ;
-0.403334 -1.00065 1.60359 0.0743512 0.238251 2.55472 0.324888 0.986377 -0.285374 -0.481205 ;
-0.613068 -0.0489537 -2.12072 0.578211 -1.12735 0.642994 1.03207 -0.70421 2.24787 0.593453 ];
# Feature vector (dimension 5) and its true label.
a = [ 0.412236
-0.778695
-0.0453014
0.894726
-0.6513 ];
b = 5;
# Expected subgradient for the fixture above.
Gtrue = [ 0.0 0.0 0.412236 0.0 0.0 -0.412236 0.0 0.0 0.0 0.0;
0.0 0.0 -0.778695 0.0 0.0 0.778695 0.0 0.0 0.0 0.0;
0.0 0.0 -0.0453014 0.0 0.0 0.0453014 0.0 0.0 0.0 0.0;
0.0 0.0 0.894726 0.0 0.0 -0.894726 0.0 0.0 0.0 0.0;
0.0 0.0 -0.6513 0.0 0.0 0.6513 0.0 0.0 0.0 0.0;
];
G = MulticlassSVMSubgradient(X, a, b);
# Require an exact match: any nonzero Euclidean distance is reported
# as an error.
if (vecnorm(G - Gtrue) > 0)
println("PROBLEM: Euclidean distance between subgradients is ",
vecnorm(G - Gtrue), " should be 0");
else
println("Correct subgradient returned");
end
end
# inds = ClassifyAll(Adata::Matrix{Float64}, X::Matrix{Float64})
#
# Takes in an N-by-d data matrix Adata, where d is the dimension and N
# is the sample size, and a classifier X, which is of size d-by-K,
# where K is the number of classes.
#
# Returns a vector of length N whose i-th entry is the predicted digit
# for row i of Adata: the index of the highest-scoring class, shifted
# down by one so labels lie in {0, 1, ..., K - 1}.
function ClassifyAll(Adata::Matrix{Float64}, X::Matrix{Float64})
    # Row i of scores holds the K per-class scores for example i.
    scores = Adata * X;
    # Row-wise maximum; findmax along dimension 2 yields linear
    # indices into the (column-major) scores matrix.
    (maxvals, linear_inds) = findmax(scores, 2);
    num_rows = size(Adata, 1);
    for kk = 1:length(linear_inds)
        # The column of linear index L in an N-by-K column-major
        # matrix is div(L - 1, N) + 1; subtracting 1 gives the digit.
        linear_inds[kk] = div(linear_inds[kk] - 1, num_rows);
    end
    return linear_inds;
end
# (Ktrain, Ktest) = GetKernelRepresentation(Atrain::Matrix{Float64},
# Atest::Matrix{Float64};
# subsample_proportion::Float64 = .1,
# tau::Float64 = 1.0)
#
# Kernelizes the raw data matrices in Atrain and Atest, returning
# kernel train and test matrices for use in classification schemes.
# The kernel function used is
#
# K(a1, a2) = exp(-norm(a1 - a2)^2 / (2 * tau^2)).
#
# The matrices Ktrain and Ktest are constructed as follows. The matrix
# Atrain is of size Ntrain-by-d, Atest is of size Ntest-by-d, where
# Ntrain/Ntest are the training and test set sizes and d is the
# dimension of the raw data.
#
# Chooses a random fraction subsample_proportion (yielding a total Nk
# indices) of the training data to use in constructing the kernel
# matrices, call the random indices chosen
#
# {i(1), i(2), ..., i(Nk)}
#
# Then Ktrain is an Ntrain-by-Nk matrix whose entries are
#
# Ktrain[i, j] = K(Atrain[i, :], Atrain[i(j), :])
#
# while Ktest is an Ntest-by-Nk matrix whose entries are
#
# Ktest[i, j] = K(Atest[i, :], Atrain[i(j), :]).
#
# Throws an error if subsample_proportion is outside (0, 1].  Note
# that the subsample is drawn with the global RNG, so results depend
# on the current seed (RunExperiment seeds it explicitly).
function GetKernelRepresentation(Atrain::Matrix{Float64},
Atest::Matrix{Float64};
subsample_proportion::Float64 = .1,
tau::Float64 = 1.0)
# Validate the subsampling proportion at the API boundary.
if (subsample_proportion <= 0 || subsample_proportion > 1)
error("Proportion for subsampling training data must be in (0, 1], was ",
subsample_proportion);
end
(Ntrain, dd) = size(Atrain);
(Ntest, dd) = size(Atest);
# Find number of random entries to kernelize and their indices
# (ceil guarantees at least one index for any positive proportion).
num_to_kernelize = ceil(Int64, subsample_proportion * Ntrain);
kernelized_inds = randperm(Ntrain);
kernelized_inds = kernelized_inds[1:num_to_kernelize];
# Compute the squared train-to-subsample distances via the expansion
# ||u - v||^2 = ||u||^2 + ||v||^2 - 2 u'v, which needs only matrix
# products rather than an explicit double loop over pairs.
squared_trains = sum(Atrain.^2, 2);
inner_product_trains = Atrain * Atrain[kernelized_inds, :]';
squared_train_distances =
(repmat(squared_trains, 1, num_to_kernelize)
+ repmat(squared_trains[kernelized_inds]', Ntrain, 1)
- 2 * inner_product_trains);
# Same expansion for the test-to-subsample distances.
squared_tests = sum(Atest.^2, 2);
inner_product_train_test = Atest * Atrain[kernelized_inds, :]';
squared_test_distances =
(repmat(squared_tests, 1, num_to_kernelize)
+ repmat(squared_trains[kernelized_inds]', Ntest, 1)
- 2 * inner_product_train_test);
# Apply the Gaussian kernel elementwise: K = exp(-d^2 / (2 tau^2)).
Ktrain = exp(-squared_train_distances / (2 * tau^2));
Ktest = exp(-squared_test_distances / (2 * tau^2));
return (Ktrain, Ktest);
end
# G = MulticlassSVMSubgradient(X, a, b)
#
# Let y = b + 1, so that y is in {1, 2, ..., 10}. (This corrects the
# indexing for b.) Returns a subgradient of the objective
#
# max_{i != y} ( 1 + a' * (X[:, i] - X[:, y]) )_+
#
# the idea being that we would like to have the correct label be
# assigned score a' * X[:, y] >> a' * X[:, i] for any incorrect label
# i != y. The notation ( . )_+ means the positive part of its
# argument.
#
# The inputs are X, of size n-by-K, where K is the number of classes,
# a of size n, and b an integer in {0, 1, ..., 9}.
#
# The returned G is n-by-K.  Let i* be the incorrect class maximizing
# the margin violation.  If the hinge term at i* is strictly positive,
# the subgradient is G[:, i*] = a and G[:, y] = -a (gradient of the
# active linear piece); otherwise the objective is locally zero and
# G = 0 is a valid subgradient.
function MulticlassSVMSubgradient(X::Matrix{Float64},
                                  a::Vector{Float64}, b::Int)
    y = b + 1;
    K = size(X, 2);
    # Find the incorrect class maximizing 1 + a' * (X[:, i] - X[:, y]).
    best_val = -Inf;
    best_ind = 0;
    for ii = 1:K
        if (ii == y)
            continue;
        end
        val = 1 + sum(a .* (X[:, ii] - X[:, y]));
        if (val > best_val)
            best_val = val;
            best_ind = ii;
        end
    end
    G = zeros(size(X));
    if (best_val > 0)
        # Hinge active: differentiate 1 + a' * (X[:, i*] - X[:, y]).
        G[:, best_ind] = a;
        G[:, y] = -a;
    end
    return G;
end
# l = MulticlassSVMLoss(X::Matrix{Float64},
# a::Vector{Float64}, b::Int)
#
# Let y = b + 1, so that y is in {1, 2, ..., 10}. (This corrects the
# indexing for b.) Returns the objective
#
# max_{i != y} ( 1 + a' * (X[:, i] - X[:, y]) )_+
#
# where ( . )_+ denotes the positive part.  The inputs are X, of size
# n-by-K (K the number of classes), a of size n, and b an integer in
# {0, 1, ..., 9}.  The returned loss is always >= 0.
function MulticlassSVMLoss(X::Matrix{Float64},
                           a::Vector{Float64}, b::Int)
    y = b + 1;
    # Starting at 0.0 implements the positive-part ( . )_+ : the
    # maximum over hinge terms is floored at zero.
    loss = 0.0;
    for ii = 1:size(X, 2)
        if (ii == y)
            continue;
        end
        loss = max(loss, 1 + sum(a .* (X[:, ii] - X[:, y])));
    end
    return loss;
end
# (X, mean_X) = sgd(Atrain::Matrix{Float64}, btrain::Vector{Int64};
# maxiter::Int64 = 10000, init_stepsize::Float64 = 1.0,
# l2_radius::Float64 = Inf)
#
# Performs maxiter iterations of projected stochastic gradient descent
# on the data contained in the matrix Atrain, of size N-by-d, where N
# is the sample size and d is the dimension, and the label vector
# btrain of integers in {0, 1, ..., 9}. Returns two d-by-10
# classification matrices X and mean_X, where the first is the final
# point of SGD and the second is the mean of all the iterates of SGD.
#
# Each iteration consists of choosing a random index from N and the
# associated data point in A, taking a subgradient step for the
# multiclass SVM objective, and projecting onto the set
#
# {X : ||X||_{Fr} <= l2_radius}
#
# where ||.||_{Fr} is the Frobenius norm (vector Euclidean norm) on
# its argument.
#
# The stepsize is init_stepsize / sqrt(iteration).
#
# Uses the global RNG for the random index draws, so results are
# reproducible only if the seed is fixed by the caller (as
# RunExperiment does).
function sgd(Atrain::Matrix{Float64}, btrain::Vector{Int64};
             maxiter::Int64 = 10000, init_stepsize::Float64 = 1.0,
             l2_radius::Float64 = Inf)
    K = 10;
    (NN, dd) = size(Atrain);
    X = zeros(dd, K);
    mean_X = zeros(dd, K);
    for iter = 1:maxiter
        # Sample one training example uniformly at random.
        ind = rand(1:NN);
        aa = vec(Atrain[ind, :]);
        # Subgradient step with decaying stepsize 1/sqrt(iter).
        G = MulticlassSVMSubgradient(X, aa, btrain[ind]);
        X = X - (init_stepsize / sqrt(iter)) * G;
        # Project onto the Frobenius-norm ball of radius l2_radius by
        # rescaling whenever the iterate leaves the ball.
        frob_norm = sqrt(sum(X.^2));
        if (frob_norm > l2_radius)
            X = (l2_radius / frob_norm) * X;
        end
        # Accumulate for the average of all iterates.
        mean_X = mean_X + X;
    end
    # Guard against division by zero when maxiter == 0 (then both
    # returned matrices are simply zero).
    if (maxiter > 0)
        mean_X = mean_X / maxiter;
    end
    return (X, mean_X);
end
# C = GenerateConfusionMatrix(Atest::Matrix{Float64}, btest::Vector{Int64},
# X::Matrix{Float64})
#
# Generates a 10-by-10 matrix C whose entry
#
# C[i, j] = Fraction of times that class j was correct but class i
# was predicted.
#
# The diagonal of C is set to zero, but one may recover the fraction
# of times that class j was correct and also predicted as
#
# 1 - sum(C[:, j]).
#
# Also displays the resulting confusion matrix as a heat map and
# returns C.
function GenerateConfusionMatrix(Atest::Matrix{Float64}, btest::Vector{Int64},
                                 X::Matrix{Float64})
    predictions = ClassifyAll(Atest, X);
    # Raw counts first: rows index predictions, columns true labels
    # (both shifted by one since digits run 0..9).
    C = zeros(10, 10);
    num_test = size(Atest, 1);
    for kk = 1:num_test
        C[predictions[kk] + 1, btest[kk] + 1] += 1;
    end
    # Normalize each column by its total count (including the correct
    # predictions), then zero the diagonal.
    for jj = 1:10
        column_total = sum(C[:, jj]);
        C[jj, jj] = 0;
        C[:, jj] /= column_total;
    end
    figure();
    imshow(C, interpolation="none");
    xlabel("True label");
    ylabel("Predicted label");
    xticks(0:9);
    yticks(0:9);
    return C;
end
# RunExperiment()
#
# Loads data, kernelizes it, and performs Projected Stochastic
# Gradient Descent. Prints results (in terms of final test error as
# well as final objective error).
#
# Trains two classifiers -- one on the raw pixel features and one on
# the Gaussian-kernel representation -- then reports test error rates
# for both the final SGD iterate and the mean iterate, and displays a
# confusion matrix heat map for each.
function RunExperiment()
# Fix the RNG seed so the kernel subsample and SGD index draws are
# reproducible across runs.
srand(0);
println("Loading dataset");
(Atest, btest, Atrain, btrain) = LoadDatasets();
println("Kernelizing dataset");
(Ktrain, Ktest) = GetKernelRepresentation(Atrain, Atest,
subsample_proportion = .2,
tau = .5);
# Scale each initial stepsize by the root-mean-squared row norm of
# its training matrix, so step lengths are comparable across the raw
# and kernelized representations.
l2_radius_raw = 40.0;
M_raw = sqrt(mean(sum(Atrain.^2, 2)));
init_stepsize_raw = l2_radius_raw / M_raw;
l2_radius_kernel = 60.0;
M_kernel = sqrt(mean(sum(Ktrain.^2, 2)));
init_stepsize_kernel = l2_radius_kernel / M_kernel;
println("Training SGD");
(X_raw, mean_X_raw) = sgd(Atrain, btrain, maxiter=40000,
init_stepsize = init_stepsize_raw,
l2_radius = l2_radius_raw);
println("Training Kernel SGD");
(X_kern, mean_X_kern) = sgd(Ktrain, btrain, maxiter=40000,
init_stepsize = init_stepsize_kernel,
l2_radius = l2_radius_kernel);
# Misclassification rates on the held-out test set, for both the
# last iterate and the mean of iterates.
Ntest = length(btest);
println("** Test error rate for raw data **\n\t",
sum(ClassifyAll(Atest, X_raw) .!= btest) / Ntest,
" [last point]\n\t",
sum(ClassifyAll(Atest, mean_X_raw) .!= btest) / Ntest,
" [mean of iterates]");
println("** Test error rate for kernelized data **\n\t",
sum(ClassifyAll(Ktest, X_kern) .!= btest) / Ntest,
" [last point]\n\t",
sum(ClassifyAll(Ktest, mean_X_kern) .!= btest) / Ntest,
" [mean of iterates]");
# Confusion heat maps: raw-data last iterate, kernelized mean iterate.
GenerateConfusionMatrix(Atest, btest, X_raw);
title("Raw data confusion");
GenerateConfusionMatrix(Ktest, btest, mean_X_kern);
title("Kernelized data confusion");
end
end # module SGD