\documentclass{article}
\usepackage[pdftex]{graphicx}
\usepackage{amsfonts}
\usepackage{amsmath, amsthm, amssymb}
\usepackage{moreverb}
\usepackage{pdfpages}
\title{CS 246: Problem Set 2}
\author{Tony Hyun Kim}
\setlength{\parindent}{0pt}
\setlength\parskip{0.1in}
\setlength\topmargin{0in}
\setlength\headheight{0in}
\setlength\headsep{0in}
\setlength\textheight{8.2in}
\setlength\textwidth{6.5in}
\setlength\oddsidemargin{0in}
\setlength\evensidemargin{0in}

\pdfpagewidth 8.5in
\pdfpageheight 11in

\newcommand{\vectornorm}[1]{\left\lVert#1\right\rVert}

\begin{document}

\maketitle

\section{Recommendation systems}

\subsection{Similarity Matrix}

We can view the cosine similarity as the dot product of the two normalized vectors $u/\vectornorm{u}$ and $v/\vectornorm{v}$.

Note that the items are represented as the columns of $R$. The norm squared of each column is given by the diagonal entries of $Q$. Hence, the normalized version of $R$ (with respect to the items) may be written $R' = R\cdot Q^{-1/2}$.

The item similarity matrix $S_I$ then follows as:
\begin{equation*}
	S_I = R'^T\cdot R' = Q^{-1/2} \cdot(R^T R)\cdot Q^{-1/2},
\end{equation*}
where we have made use of the symmetry of $Q^{-1/2}$.

Likewise, the users are represented as the rows of $R$. The norm squared of each row is given by the diagonal entries of $P$. The normalized version of $R$ (this time with respect to the users) is $R''=P^{-1/2}R$. Hence:
\begin{equation*}
	S_U = R''\cdot R''^T = P^{-1/2}\cdot(RR^T)\cdot P^{-1/2}.
\end{equation*}
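For concreteness, here is a minimal Python/NumPy sketch (illustrative only, not part of the attached code) that computes both similarity matrices from a small binary ratings matrix, assuming every user and item has at least one rating so that $P^{-1/2}$ and $Q^{-1/2}$ exist:

\begin{verbatim}
import numpy as np

R = np.array([[1, 1, 0],
              [0, 1, 1]], dtype=float)   # 2 users x 3 items

# Diagonal matrices: P holds the squared row (user) norms,
# Q holds the squared column (item) norms.
P_inv_sqrt = np.diag(1.0 / np.sqrt(np.diag(R @ R.T)))
Q_inv_sqrt = np.diag(1.0 / np.sqrt(np.diag(R.T @ R)))

S_I = Q_inv_sqrt @ (R.T @ R) @ Q_inv_sqrt   # item-item cosine similarities
S_U = P_inv_sqrt @ (R @ R.T) @ P_inv_sqrt   # user-user cosine similarities
\end{verbatim}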

\subsection{Recommendation matrix}

The collaborative filtering recommendation for user $u$ and item $s$ is defined as:
\begin{eqnarray*}
	\text{User-user:}\qquad r_{u,s} &=& \sum_{x \in \mathrm{Users}} \cos(u,x)\cdot R(x,s),\\
	\text{Item-item:}\qquad r_{u,s} &=& \sum_{x \in \mathrm{Items}} R(u,x) \cdot \cos(x,s).
\end{eqnarray*}

The sums above are precisely matrix products. Therefore,
\begin{equation*}
	\Gamma_{\text{User-User}} = S_\mathrm{U} \cdot R = P^{-1/2}\cdot(RR^T)\cdot P^{-1/2} \cdot R
\end{equation*}
and
\begin{equation*}
	\Gamma_{\text{Item-Item}} = R \cdot S_\mathrm{I} = R \cdot Q^{-1/2} \cdot(R^T R)\cdot Q^{-1/2}.
\end{equation*}
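On the toy matrix from the previous sketch, both recommendation matrices can be formed directly (again illustrative only):

\begin{verbatim}
import numpy as np

R = np.array([[1, 1, 0],
              [0, 1, 1]], dtype=float)

P_inv_sqrt = np.diag(1.0 / np.sqrt((R * R).sum(axis=1)))  # P^{-1/2}
Q_inv_sqrt = np.diag(1.0 / np.sqrt((R * R).sum(axis=0)))  # Q^{-1/2}

Gamma_uu = P_inv_sqrt @ (R @ R.T) @ P_inv_sqrt @ R   # user-user CF scores
Gamma_ii = R @ Q_inv_sqrt @ (R.T @ R) @ Q_inv_sqrt   # item-item CF scores
\end{verbatim}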

\subsection{Graphical representation of the recommendation matrix}

Each entry $\Gamma_{u,s}$ of the recommendation matrix represents a (weighted) sum over the indirect paths between user $u$ and item $s$ of the form:
\begin{equation*}
	u \leftrightarrow s' \leftrightarrow u' \leftrightarrow s
\end{equation*}
where $u$ and $u'$ are users, and $s$ and $s'$ are items.

To arrive at this interpretation, consider the expression for user-user collaborative filtering (the interpretation is the same for item-item CF, though the ``weights'' of each path will be different):
\begin{equation}
	r_{u,s} = \sum_{u' \in \mathrm{Users}} \cos(u,u')\cdot R(u',s).
	\label{eq:user-user}
\end{equation}

Fix $u$ and $s$. Then Eq.~\ref{eq:user-user} is a sum that involves terms of the form $\cos(u,u')\cdot R(u',s)$ where 
\begin{itemize}
	\item The latter term $R(u',s)$ is nonzero only when there is a direct connection between $u'$ and $s$.
	\item The cosine similarity $\cos(u,u')$ is nonzero when users $u$ and $u'$ share some items $s'$ in common.
\end{itemize}
Hence, the product is nonzero when there exists a path $u \leftrightarrow s' \leftrightarrow u' \leftrightarrow s$ between user $u$ and item $s$. Note that Eq.~\ref{eq:user-user} also contains the term $\cos(u,u)\cdot R(u,s) = R(u,s)$, which adds $1$ to $r_{u,s}$ whenever there is a direct connection between user $u$ and item $s$.

\subsection{TV show collaborative filtering}

\subsubsection{Top $5$ user-user collaborative filtering results}

\begin{verbatim}
Rank: Title (Score)
1: FOX 28 News at 10pm (908.480)
2: Family Guy (861.176)
3: 2009 NCAA Basketball Tournament (827.601)
4: NBC 4 at Eleven (784.782)
5: Two and a Half Men (757.601)
\end{verbatim}

\subsubsection{Top $5$ item-item collaborative filtering results}

\begin{verbatim}
Rank: Title (Score)
1: FOX 28 News at 10pm (31.365)
2: Family Guy (30.001)
3: NBC 4 at Eleven (29.397)
4: 2009 NCAA Basketball Tournament (29.227)
5: Access Hollywood (28.971)
\end{verbatim}

\subsubsection{Precision in top-$k$ predictions for user-user and item-item collaborative filtering}

Fig.~\ref{fig:collabfilter} shows the precision at top-$k$. The collaborative filtering precision of roughly $40\%$ is well above the $21\%$ baseline that one would obtain by random guessing.

\begin{figure}[ht]
	\begin{center}
		\includegraphics[width=0.7\textwidth]{collabfilter_baseline.pdf}
	\end{center}
	\caption{Precision in top-$k$ predictions for Alex by user-user and item-item collaborative filtering. The CF performance exceeds the $21\%$ baseline that would be obtained by random guessing. Presumably, if we had implemented a richer rating system (\emph{e.g.} with different levels of ``like''), we could improve on this performance.\label{fig:collabfilter}}
\end{figure} 

\section{Singular value decomposition}

\subsection{Finding the SVD using $AA^T$ or $A^T A$}

\subsubsection{Use $AA^T$ or $A^T A$?}

In this problem, the matrix $A$ is $100 \times 10000$. Hence, $AA^T$ is $100 \times 100$ while $A^T A$ is $10000 \times 10000$. Diagonalizing the former is far cheaper, so we take $AA^T$.

\subsubsection{Computing $U$ and $V$}

Since $A = USV^T$, we have 
\begin{eqnarray*}
	A A^T &=& (USV^T)\cdot(VS^T U^T) = U (SS^T) U^T,\,\mathrm{and}\\
	A^T A &=& (VS^T U^T)\cdot(USV^T) = V (S^T S) V^T.
\end{eqnarray*}
Hence, $U$ and $V$ may be computed by diagonalizing $AA^T$ and $A^T A$ respectively.
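A quick numeric check of this fact (a sketch only; \verb=numpy.linalg.eigh= applies since both products are symmetric). Note that eigenvectors are only determined up to sign, so a $U$ and $V$ obtained from two independent diagonalizations are not automatically paired consistently; the following subsections address exactly this point.

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 8))   # small stand-in for the 100 x 10000 case

# Diagonalize the two symmetric products; eigh returns ascending order.
lam_u, U = np.linalg.eigh(A @ A.T)
lam_v, V = np.linalg.eigh(A.T @ A)

# The square roots of the eigenvalues of A A^T are the singular values.
sing = np.sqrt(lam_u[::-1].clip(min=0))
assert np.allclose(sing, np.linalg.svd(A, compute_uv=False))
\end{verbatim}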

\subsubsection{Prove that $A^T u_i = \lambda_i v_i$\label{subsubsec:vfromu}}

Let $x_i^{(m)}$ be the $i$-th standard basis vector in $\mathbb{R}^m$ and $x_i^{(n)}$ likewise in $\mathbb{R}^n$. We have
\begin{eqnarray*}
	A^T u_i &=& (VS^T U^T)\cdot u_i = VS^T (U^T\cdot u_i)\\
			&=& VS^T x_i^{(m)} = V (S^T x_i^{(m)})\\
			&=& V \lambda_i x_i^{(n)} = \lambda_i (V x_i^{(n)})\\
			&=& \lambda_i v_i.
\end{eqnarray*}

\subsubsection{Prove that $U$ and $V$ may be computed by decomposing $AA^T$ only\label{subsubsec:uvfromaat}}

Suppose we have decomposed $AA^T$. We have then computed $U$ as well as the diagonal eigenvalue matrix $\Lambda$ corresponding to $AA^T$. 

Note that the singular values $\lambda_i$ in $S$ (which we may take to be nonnegative without loss of generality) are related to the entries of $\Lambda$ by $\lambda_i = \sqrt{\Lambda_i}$. Hence, by decomposing $AA^T$, we have determined both $U$ and $S$.

Then, by applying the results of Section~\ref{subsubsec:vfromu}, we may compute the corresponding components $v_i$ from the known $u_i$'s.

Remark: the vectors $u_i$ and $v_i$ corresponding to $\lambda_i=0$ may be paired arbitrarily without affecting the resulting SVD.
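The whole procedure fits in a few lines of NumPy; the following is a sketch under the assumption that $A$ has full row rank, so that every $\lambda_i > 0$ and the division below is safe:

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)
A = rng.standard_normal((4, 7))

lam2, U = np.linalg.eigh(A @ A.T)     # eigen-decomposition of A A^T only
order = np.argsort(lam2)[::-1]        # sort eigenpairs in descending order
lam2, U = lam2[order], U[:, order]
S = np.sqrt(lam2.clip(min=0))         # singular values lambda_i

V = (A.T @ U) / S                     # v_i = A^T u_i / lambda_i
assert np.allclose((U * S) @ V.T, A)  # recovers A = U S V^T
\end{verbatim}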

\subsubsection{Prove that $U$ and $V$ may be computed by decomposing $A^T A$ only}

In Section~\ref{subsubsec:uvfromaat}, swap the roles of $u_i$ and $v_i$ and replace $AA^T$ with $A^T A$; the proof is otherwise identical.

\subsection{Finding the SVD using $M$}

\subsubsection{Relationship between $M$ and $M^T$}

We have
\begin{equation*}
	M^T = \left[ \begin{array}{cc}
				  0 & A^T  \\
			      A & 0 \end{array} \right]^T 
		= \left[ \begin{array}{cc}
				  0 & (A)^T  \\
			      (A^T)^T & 0 \end{array} \right]
	    = \left[ \begin{array}{cc}
				  0 & A^T  \\
			      A & 0 \end{array} \right] = M.
\end{equation*}

\subsubsection{Eigenvectors of $M$}

Let $u_i$ and $v_i$ be corresponding columns in $U$ and $V$. Then, $u_i$ and $v_i$ satisfy the relation proven in Section~\ref{subsubsec:vfromu}: $A^T u_i = \lambda_i v_i$ (and likewise $A v_i = \lambda_i u_i$).

Consider the vector $\left[\begin{array}{cc} v_i & -u_i\end{array}\right]^T$. We have
\begin{equation*}
	M \cdot \left[\begin{array}{c} v_i \\ -u_i\end{array}\right] = 
	\left[ \begin{array}{cc}
				  0 & A^T  \\
			      A & 0 \end{array} \right] \cdot 
	\left[\begin{array}{c} v_i \\ -u_i\end{array}\right]
	= \left[\begin{array}{c} -A^T u_i \\ A v_i\end{array}\right]
	= \left[\begin{array}{c} -\lambda_i v_i \\ \lambda_i u_i\end{array}\right]
	= -\lambda_i \left[\begin{array}{c} v_i \\ -u_i\end{array}\right]
\end{equation*}
and likewise
\begin{equation*}
	M \cdot \left[\begin{array}{c} v_i \\ u_i\end{array}\right] = 
	\left[ \begin{array}{cc}
				  0 & A^T  \\
			      A & 0 \end{array} \right] \cdot 
	\left[\begin{array}{c} v_i \\ u_i\end{array}\right]
	= \left[\begin{array}{c} A^T u_i \\ A v_i\end{array}\right]
	= \left[\begin{array}{c} \lambda_i v_i \\ \lambda_i u_i\end{array}\right]
	= \lambda_i \left[\begin{array}{c} v_i \\ u_i\end{array}\right].
\end{equation*}

Hence, the vectors $\left[\begin{array}{cc} v_i & -u_i\end{array}\right]^T$ and $\left[\begin{array}{cc} v_i & u_i\end{array}\right]^T$ are eigenvectors of $M$ with eigenvalues $-\lambda_i$ and $\lambda_i$ respectively.

\subsubsection{Prove that the eigenvalues are $-\lambda_i$ and $\lambda_i$}

The eigenvalues were derived in the previous section. Furthermore, we have shown that $M$ is symmetric and hence may be orthogonally diagonalized. This yields the SVD of $A$: the vectors $u_i$ and $v_i$ are read off from the eigenvectors of $M$, and the singular values correspond to the positive eigenvalues of $M$.
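As a numeric sanity check, the following sketch (illustrative only) builds $M$ for a small random $A$ and confirms that its positive eigenvalues coincide with the singular values of $A$:

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(2)
A = rng.standard_normal((3, 5))
m, n = A.shape

M = np.block([[np.zeros((n, n)), A.T],
              [A, np.zeros((m, m))]])

w, Z = np.linalg.eigh(M)              # M is symmetric
pos = np.sort(w[w > 1e-8])            # positive eigenvalues of M
assert np.allclose(pos, np.sort(np.linalg.svd(A, compute_uv=False)))
\end{verbatim}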

\subsection{Document similarity using SVD}

The variation of the $r$-score as a function of the compression parameter $k$ is shown in Fig.~\ref{fig:rscore}. The Matlab code for computing the $r$-score is attached on the following page.
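The $r$-score itself is computed in the attached Matlab code; as a language-neutral illustration, the sketch below (Python/NumPy, with a random stand-in for the term-document matrix) shows the rank-$k$ compression and the cosine similarities between document columns on which such a score is built:

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(7)
X = rng.standard_normal((300, 40))        # stand-in term-document matrix

U, s, Vt = np.linalg.svd(X, full_matrices=False)

k = 10                                    # number of singular values kept
Xk = U[:, :k] @ np.diag(s[:k]) @ Vt[:k]   # rank-k approximation of X

# Cosine similarity between document columns of the compressed matrix.
norms = np.linalg.norm(Xk, axis=0)
sim = (Xk.T @ Xk) / np.outer(norms, norms)
\end{verbatim}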

\begin{figure}[t]
	\begin{center}
		\includegraphics[width=0.7\textwidth]{docsimilarity_svd.pdf}
	\end{center}
	\caption{The $r$-score showing the mean similarity of the ``baseball documents'' compared to the entire dataset as a function of SVD compression. Here, $k$ denotes the number of singular values preserved in the compression, and $k=497$ denotes the uncompressed matrix.\label{fig:rscore}}
\end{figure} 

\includepdf{svd-code.pdf}

\section{Theory of $k$-means}

\subsection{An identity involving the center of mass\label{subsec:centerofmass}}

Let $S$ be an arbitrary set of points with center of mass $c(S)=\left(\sum_{x\in S}x\right)/|S|$ and $z$ be an arbitrary point. We wish to show that
\begin{equation}
	\sum_{x\in S}\vectornorm{x-z}^2-\sum_{x\in S}\vectornorm{x-c(S)}^2 = |S|\cdot\vectornorm{c(S)-z}^2.
\end{equation}

Consider the LHS:
\begin{eqnarray*}
	\sum_{x\in S}\vectornorm{x-z}^2-\sum_{x\in S}\vectornorm{x-c(S)}^2 &=&
		\sum_{x\in S}\left(\vectornorm{x-z}^2-\vectornorm{x-c(S)}^2\right)\\
		&=& \sum_{x\in S} \left(\vectornorm{x}^2-2x\cdot z+\vectornorm{z}^2 -\vectornorm{x}^2+2x\cdot c(S) -\vectornorm{c(S)}^2\right)\\
		&=& \sum_{x\in S} \left(2x\cdot (c(S)-z) + \vectornorm{z}^2 - \vectornorm{c(S)}^2\right)\\
		&=& |S|\cdot\left[2c(S)\cdot(c(S)-z)+\vectornorm{z}^2-\vectornorm{c(S)}^2\right]\\
		&=& |S|\cdot\left[2\vectornorm{c(S)}^2-2c(S)\cdot z+\vectornorm{z}^2-\vectornorm{c(S)}^2\right]\\
		&=& |S|\cdot\left[\vectornorm{c(S)}^2-2c(S)\cdot z + \vectornorm{z}^2\right] = |S|\cdot\vectornorm{c(S)-z}^2,
\end{eqnarray*}
where the fourth line uses $\sum_{x\in S}x = |S|\,c(S)$.

In particular, this identity shows that the point minimizing the sum of squared distances to a set of points $S$ is the center of mass $c(S)$.
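The identity is easy to verify numerically; a minimal sketch:

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(3)
S = rng.standard_normal((10, 2))      # ten points in the plane
z = rng.standard_normal(2)            # an arbitrary point
c = S.mean(axis=0)                    # center of mass c(S)

lhs = ((S - z)**2).sum() - ((S - c)**2).sum()
rhs = len(S) * ((c - z)**2).sum()
assert np.isclose(lhs, rhs)
\end{verbatim}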

\subsection{Prove that each iteration of $k$-means decreases the cost}

The cost function is 
\begin{equation}
	\phi(\mathcal{C},\left\{C_i\right\}) = \sum_{x \in \mathcal{X}} \min_{c\in \mathcal{C}} \vectornorm{x-c}^2,
	\label{eq:kmeanscost}
\end{equation}
which, for a given dataset $\mathcal{X}$, can be considered a function of the position of the centers as well as the assignment of the data to those centers.

We consider the two steps of each $k$-means iteration (denoted steps $2$ and $3$ in the problem set); a numeric sketch of both steps follows the list.
\begin{itemize}
	\item In Step 2, the centers $c\in\mathcal{C}$ are held fixed and each point $x\in\mathcal{X}$ is assigned to its nearest center. This minimizes Eq.~\ref{eq:kmeanscost} with respect to $\left\{C_i\right\}$ (with $\mathcal{C}$ fixed).
	\item In Step 3, the assignment of the data to the centers is fixed, and the centers are re-computed to be the center of mass of the corresponding group. This can be considered an optimization of Eq.~\ref{eq:kmeanscost} with respect to $\mathcal{C}$ (with $\left\{C_i\right\}$ fixed). Note that we can rewrite Eq.~\ref{eq:kmeanscost} as 
	\begin{equation*}
		\phi(\mathcal{C},\left\{C_i\right\}) = \sum_{x\in C_1}\vectornorm{x-c_1}^2+\sum_{x\in C_2}\vectornorm{x-c_2}^2+\cdots+\sum_{x\in C_k}\vectornorm{x-c_k}^2
	\end{equation*}
	where each term takes the form that we analyzed in Section~\ref{subsec:centerofmass}. Hence we are indeed optimizing $\phi$ with respect to $\mathcal{C}$ by assigning $c_i=c(C_i)$.
\end{itemize}
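The monotonicity is easy to observe numerically. The sketch below (illustrative only; it assumes no cluster goes empty, falling back to the old center if one does) runs both steps repeatedly and asserts that $\phi$ never increases:

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(4)
X = rng.standard_normal((200, 2))
C = X[rng.choice(len(X), size=3, replace=False)]    # 3 initial centers

def cost(X, C):
    d2 = ((X[:, None, :] - C[None, :, :])**2).sum(axis=2)
    return d2.min(axis=1).sum()

for _ in range(10):
    before = cost(X, C)
    # Step 2: assign each point to its nearest center.
    labels = ((X[:, None, :] - C[None, :, :])**2).sum(axis=2).argmin(axis=1)
    # Step 3: move each center to the center of mass of its points.
    C = np.array([X[labels == i].mean(axis=0) if np.any(labels == i)
                  else C[i] for i in range(len(C))])
    assert cost(X, C) <= before + 1e-9
\end{verbatim}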

\subsection{Convergence of the cost function}

We have shown that with each iteration of the $k$-means algorithm, the cost function $\phi$ must decrease (or stay the same). Now, the cost function is clearly bounded from below: for instance, we have the trivial result $\phi \geq 0$. It then follows that the cost function must converge.

\subsection{Bad initialization example}

\begin{figure}[t]
	\begin{center}
		\includegraphics[width=0.8\textwidth]{badkmeans.pdf}
	\end{center}
	\caption{Demonstration of unfortunate $k$-means convergence due to bad initialization of centroids. (a) The dataset consists of $2n+1$ points with $n$ overlapping points at $x=0$, $n$ points at $x=2$, and a single point at $x=5$. We assume $n\gg1$. (b) Stationary clusters (red crossmarks) where the left cluster owns $2n$ points at $x=0,2$, and the right cluster is assigned the single point at $x=5$. (c) A better solution where the two clusters are assigned to $x=0$ and $x\approx2$.\label{fig:badkmeans}}
\end{figure} 

Here is my pathological case that, with a bad initialization, may lead to a stationary set of clusters whose sum squared error (SSE) cost is more than $r$ times the optimal cost.

Consider $2n+1$ data points in one dimension, laid out as shown in Fig.~\ref{fig:badkmeans}(a): there are $n$ overlapping points at $x=0$, $n$ points at $x=2$, and a single point at $x=5$, and we assume $n\gg 1$. Fig.~\ref{fig:badkmeans}(b) shows a stationary but terrible cluster assignment (red marks) in which the left cluster owns the $2n$ points at $x=0,2$ and the right cluster owns only the single point at $x=5$; the corresponding SSE is $2n$. On the other hand, Fig.~\ref{fig:badkmeans}(c) shows a much better assignment in which each centroid sits on one of the two large groups of points. (The right centroid is slightly displaced by the point at $x=5$.) The corresponding cost, denoted SSE', is approximately $3^2=9$.

We now choose $n$ such that $\mathrm{SSE}/\mathrm{SSE}' = 2n/9 > r$, which proves the claim.
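A short numeric check of this construction (a sketch, taking $n=100$):

\begin{verbatim}
import numpy as np

n = 100
X = np.concatenate([np.zeros(n), np.full(n, 2.0), [5.0]])

def sse(X, centers):
    return ((X[:, None] - centers[None, :])**2).min(axis=1).sum()

bad = np.array([1.0, 5.0])                  # stationary assignment of (b)
good = np.array([0.0, (2*n + 5)/(n + 1)])   # better assignment of (c)

print(sse(X, bad))     # 2n = 200
print(sse(X, good))    # approximately 9
\end{verbatim}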

\section{$k$-means on MapReduce}

\subsection{$k$-means implementation}

The two steps of each $k$-means iteration map naturally onto MapReduce. The mapper reads a chunk of the data and, for each datum $x$, computes the index $i$ of the nearest centroid $c_i$ and emits the key-value pair $(i,x)$. The reducer then re-computes the center of mass of the points assigned to each center.

I used \verb=DistributedCache= to initialize the list of centers for every mapper instance, and implemented a \verb=VectorWritable= representation for the data. The code is attached.
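The attached source is the actual Hadoop implementation; the following is only a minimal Python simulation of the same map/reduce logic (the dictionary stands in for the shuffle/sort phase):

\begin{verbatim}
from collections import defaultdict
import numpy as np

def mapper(x, centers):
    # Emit (index of the nearest centroid, the point itself).
    i = int(np.argmin([np.sum((x - c)**2) for c in centers]))
    yield i, x

def reducer(i, points):
    # Recompute the center of mass of the points assigned to center i.
    return i, np.mean(points, axis=0)

X = np.random.default_rng(5).standard_normal((50, 2))
centers = X[:3]
groups = defaultdict(list)
for x in X:
    for key, value in mapper(x, centers):
        groups[key].append(value)
new_centers = [reducer(i, pts)[1] for i, pts in sorted(groups.items())]
\end{verbatim}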

\includepdf[pages=-,nup=1x2,landscape=true]{kmeans_source.pdf}

\subsection{Computing the cost function}

As suggested, I compute the cost function within the same MapReduce job. I achieve this with the following trick: each mapper, in addition to emitting the key-value pair $(i,x)$ that assigns datum $x$ to centroid index $i$, also emits the squared distance under a special key: $(-1,\vectornorm{x-c_i}^2)$.

A reducer instance can then determine from the flag key $-1$ that it is responsible for the cost function. This specialized reducer writes the accumulated cost directly to HDFS. Note that in a single MapReduce run only this one reducer (and its repeated instantiations) attempts the direct write to HDFS, making write contention unlikely even in a large-scale run.
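In the simulation sketch above, the trick amounts to one extra emission in the mapper and one extra branch in the reducer (again illustrative only, not the attached Hadoop code):

\begin{verbatim}
from collections import defaultdict
import numpy as np

def mapper(x, centers):
    d2 = [float(np.sum((x - c)**2)) for c in centers]
    i = int(np.argmin(d2))
    yield i, x        # ordinary assignment pair (i, x)
    yield -1, d2[i]   # cost contribution, routed by the flag key -1

def reducer(key, values):
    if key == -1:                         # the specialized cost reducer
        return key, float(sum(values))    # accumulated phi this iteration
    return key, np.mean(values, axis=0)   # ordinary centroid update

X = np.random.default_rng(6).standard_normal((50, 2))
centers = X[:3]
groups = defaultdict(list)
for x in X:
    for k, v in mapper(x, centers):
        groups[k].append(v)
print(reducer(-1, groups[-1])[1])         # value of phi for this iteration
\end{verbatim}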

\subsection{Comparison of the two initialization strategies}

Fig.~\ref{fig:kmeanscost} shows the decrease in the value of the cost function $\phi$ with each iteration of $k$-means. I have normalized the cost by the maximum observed value. It is clear that the random initialization strategy (\verb=c1.txt=) performs far worse than the initialization scheme in which the centroids were separated as far as possible (\verb=c2.txt=).

After $10$ iterations, the cost corresponding to \verb=c1.txt= has decreased to $74\%$ of its initial value, while that corresponding to \verb=c2.txt= has decreased to $25\%$ of its initial value. Comparing \verb=c1= and \verb=c2= directly, we find that after $10$ iterations the random initialization yields an SSE that is about $320\%$ higher than that of the distant initialization.

\begin{figure}[hb!]
	\begin{center}
		\includegraphics[width=0.65\textwidth]{kmeans_cost.pdf}
	\end{center}
	\caption{Decrease in the cost function $\phi$ (Eq.~\ref{eq:kmeanscost}) with iterations of $k$-means. Values have been normalized by the maximum observed cost. The maximum-distance initialization of the centroids clearly outperforms random initialization.\label{fig:kmeanscost}}
\end{figure} 

\end{document}