\documentclass{article}
\usepackage[pdftex]{graphicx}
\usepackage{amsfonts}
\usepackage{amsmath, amsthm, amssymb}
\usepackage{moreverb}
\usepackage{pdfpages}
\usepackage{multirow}
\title{CS 246: Problem Set 3}
\author{Tony Hyun Kim}
\setlength{\parindent}{0pt}
\setlength\parskip{0.1in}
\setlength\topmargin{0in}
\setlength\headheight{0in}
\setlength\headsep{0in}
\setlength\textheight{8.2in}
\setlength\textwidth{6.5in}
\setlength\oddsidemargin{0in}
\setlength\evensidemargin{0in}

\pdfpagewidth 8.5in
\pdfpageheight 11in

% THK: Commands accumulated over the problem sets...
\newcommand{\vectornorm}[1]{\left|\left|#1\right|\right|}

\begin{document}

\maketitle

\section{Similarity Ranking}

\subsection{Similarity by ``hand''}

I did the first iteration by hand (really), and realized that I was not sufficiently awake to carry out the computation for three more iterations without making a mistake. So, I wrote a short Matlab script (attached) to compute the similarity. The rows and columns of $S_A$ are ordered $\{$cameras, phones, printers$\}$; and the rows and columns of $S_B$ are ordered $\{$HP, Nokia, Kodak, Apple, Canon$\}$. Sim(camera,phone) and sim(camera,printer) correspond respectively to the two boldface entries of $S_A$ indicated below. Note that we expect sim(camera,printer)$=0$ since the printer-HP graph is disjoint from the remaining bipartite graph.

Iteration $1$:
\begin{equation*}
S_A^{(1)} = \begin{bmatrix}
				1 & \mathbf{0.1333} & \mathbf{0} \\
				0.1333 & 1 & 0 \\
				0 & 0 & 1 \end{bmatrix}
\qquad\text{and}\qquad
S_B^{(1)} = \begin{bmatrix}
				1 & 0 & 0 & 0 & 0 \\
				0 & 1 & 0.4000 & 0.4000 & 0.4000 \\
				0 & 0.4000 & 1 & 0 & 0.8000 \\
				0 & 0.4000 & 0 & 1 & 0 \\
				0 & 0.4000 & 0.8000 & 0 & 1 \end{bmatrix}
\end{equation*}

Iteration $2$:
\begin{equation*}
S_A^{(2)} = \begin{bmatrix}
				1 & \mathbf{0.2933} & \mathbf{0} \\
				0.2933 & 1 & 0 \\
				0 & 0 & 1 \end{bmatrix}
\qquad\text{and}\qquad
S_B^{(2)} = \begin{bmatrix}
				1 & 0 & 0 & 0 & 0 \\
				0 & 1 & 0.4533 & 0.4533 & 0.4533 \\
				0 & 0.4533 & 1 & 0.1067 & 0.8000 \\
				0 & 0.4533 & 0.1067 & 1 & 0.1067 \\
				0 & 0.4533 & 0.8000 & 0.1067 & 1 \end{bmatrix}
\end{equation*}

Iteration $3$:
\begin{equation*}
S_A^{(3)} = \begin{bmatrix}
				1 & \mathbf{0.3431} & \mathbf{0} \\
				0.3431 & 1 & 0 \\
				0 & 0 & 1 \end{bmatrix}
\qquad\text{and}\qquad
S_B^{(3)} = \begin{bmatrix}
				1 & 0 & 0 & 0 & 0 \\
				0 & 1 & 0.5173 & 0.5173 & 0.5173 \\
				0 & 0.5173 & 1 & 0.2347 & 0.8000 \\
				0 & 0.5173 & 0.2347 & 1 & 0.2347 \\
				0 & 0.5173 & 0.8000 & 0.2347 & 1 \end{bmatrix}
\end{equation*}

\includepdf[pages=-]{p1a.pdf}

\subsection{Using weights}

If we associate a weight $W_{(X,y)}$ with an edge $(X,y)$ in the bipartite graph, a natural way to re-write the update equations is 
\begin{equation*}
	s(X,Y) = \frac{C_1}{\sum_{i=1}^{|O(X)|} \sum_{j=1}^{|O(Y)|} W_{(X,O_i(X))}\cdot W_{(Y,O_j(Y))}} \sum_{i=1}^{|O(X)|} \sum_{j=1}^{|O(Y)|} W_{(X,O_i(X))} W_{(Y,O_j(Y))} \cdot s(O_i(X),O_j(Y))
\end{equation*}
and, similarly,
\begin{equation*}
	s(x,y) = \frac{C_2}{\sum_{i=1}^{|I(x)|} \sum_{j=1}^{|I(y)|} W_{(I_i(x),x)}\cdot W_{(I_j(y),y)}} \sum_{i=1}^{|I(x)|} \sum_{j=1}^{|I(y)|} W_{(I_i(x),x)} W_{(I_j(y),y)}\cdot s(I_i(x),I_j(y))
\end{equation*}

\subsection{Similarities in $K_{2,1}$ and $K_{2,2}$}

Similarity matrix of set $A$ in $K_{2,1}$ after $3$ iterations is
\begin{equation*}
S_A^{(3)} = \begin{bmatrix}
				1 & 0.8 \\
				0.8 & 1 \end{bmatrix},
\end{equation*}
and in $K_{2,2}$ it is:
\begin{equation*}
S_A^{(3)} = \begin{bmatrix}
				1 & 0.6240 \\
				0.6240 & 1 \end{bmatrix}.
\end{equation*}
So, the similarity of the elements in $A$ is higher in the smaller bipartite graph $K_{2,1}$ than $K_{2,2}$, contrary to the ``intuitive'' expectation.

\subsection{Fix the problem}

%\subsection{Probabilistic interpretation of $\mathrm{sim}(x,y)$}

\section{Dense Communities in Networks}

\subsection{Running time\label{subsec:runningtime}}

\subsubsection{Prove that at any iteration of the algorithm, $|A(S)|\geq\frac{\epsilon}{1+\epsilon}|S|$}

We present a proof by contradiction. Suppose that, at some iteration, we have $|A(S)|<\frac{\epsilon}{1+\epsilon}|S|$. Consider the complement set $S-A(S)$, consisting of elements $x\in S$ that satisfy $\mathrm{deg}_S(x) > 2(1+\epsilon)\rho(S)$. It follows that $S-A(S)$ contains more than $\frac{1}{1+\epsilon}|S|$ elements.

We then count the total degree of all elements in $S$:
\begin{equation*}
	\sum_{x \in S} \mathrm{deg}_S(x) \geq \sum_{x \in (S-A(S))} \mathrm{deg}_S(x) > \frac{1}{1+\epsilon}|S| \cdot 2(1+\epsilon) \rho(S) = 2|E(S)|,
\end{equation*}
where we have used the definition $\rho(S) = |E(S)|/|S|$. We find that the total degree of all nodes in $S$ strictly exceeds $2 |E(S)|$, which is impossible since each edge in $E(S)$ contributes exactly twice (at each endpoint) to the total degree $\sum_{x \in S} \mathrm{deg}_S(x)$. Hence, the initial assumption is false and we must have $|A(S)|\geq\frac{\epsilon}{1+\epsilon}|S|$.

\subsubsection{Prove that the algorithm terminates in at most $\log_{1+\epsilon}(n)$ iterations}

This is a simple extension of the previous result that $|A(S)|\geq\frac{\epsilon}{1+\epsilon}|S|$. Denote the size of the original graph by $|S_0|$. Since we remove $A(S)$ from $S$ at each iteration, the size of the set after the $n$-th step is bounded by
\begin{equation*}
	|S_n| < \frac{1}{1+\epsilon}|S_{n-1}| < \left(\frac{1}{1+\epsilon}\right)^n |S_0|,
\end{equation*}
which, for $n \geq \log_{1+\epsilon}(|S_0|)$, yields $|S_n|<1$. In other words, the set $S_n$ is empty and the algorithm must terminate since there's no more work to do!

\subsection{Quality of result}

Our aim is to show that the density of the set returned by the algorithm is at most a factor $2(1+\epsilon)$ smaller than $\rho^*(G)$. (So, larger $\epsilon$ yields poorer results but converges faster. Typical story.)

\subsubsection{Step 1\label{subsubsec:step1}}

Let $S^*$ be the densest subgraph of $G$. For any $v \in S^*$, we wish to show that $\mathrm{deg}_{S^*}(v) \geq \rho^*(G)=\rho(S^*) = |E(S^*)|/|S^*|$. Again, we proceed by contradiction.

Suppose there exists a $v \in S^*$ such that $\deg_{S^*}(v) < \rho(S^*)$. Now consider the set $S'$ obtained by removing $v$ from $S^*$, \emph{i.e.} $S' = S^* - \left\{v\right\}$. We compute the density of the new set $S'$
\begin{equation*}
	\rho(S') = \frac{|E(S')|}{|S'|} = \frac{|E(S^*)|-\mathrm{deg}_{S^*}(v)}{|S^*|-1} > \frac{|E(S^*)|-\rho(S^*)}{|S^*|-1},
\end{equation*}
where we have made use of the fact that, by removing $v$ from $S^*$, we have deleted $\mathrm{deg}_{S^*}(v) <  \rho(S^*)$ edges from $E(S^*)$. We now substitute $\rho(S^*) = |E(S^*)|/|S^*|$ to yield
\begin{equation*}
	\rho(S') > \frac{|E(S^*)|-\frac{|E(S^*)|}{|S^*|}}{|S^*|-1} = \frac{\frac{|E(S^*)|}{|S^*|}\cdot(|S^*|-1)}{|S^*|-1} = \rho(S^*)
\end{equation*}
showing that $S'$ is a denser subgraph of $G$ than $S^*$, thereby contradicting the premise that $S^*$ is the densest subgraph. Hence, we must have $\deg_{S^*}(v) \geq \rho(S^*)$.

\subsubsection{Step 2\label{subsubsec:step2}}

Note that all nodes of the original graph (including those belonging to $S^*$) will occur in $A(S)$ at some iteration. We consider the first iteration in which there exists a node $v \in S^* \cap A(S)$.

Since $v \in S^*$, we have $\mathrm{deg}_{S^*}(v) \geq \rho(S^*)$ by the result of Section~\ref{subsubsec:step1}. And since $v \in A(S)$, we also have that $\mathrm{deg}_S(v) \leq 2(1+\epsilon)\rho(S)$.

Now, we also have that $S^* \subseteq S$ since we are considering the first iteration for which $S^* \cap A(S) \neq \emptyset$. It follows that $\mathrm{deg}_S(v) \geq \mathrm{deg}_{S^*}(v)$. Combining all of these inequalities, we find
\begin{equation*}
	2(1+\epsilon)\rho(S) \geq \mathrm{deg}_S(v) \geq \mathrm{deg}_{S^*}(v) \geq \rho(S^*).
\end{equation*}

\subsubsection{Step 3}

Finally, we wish to show that the density of the returned set $\tilde{S}$ satisfies
\begin{equation*}
	\rho(\tilde{S}) \geq \frac{1}{2(1+\epsilon)} \rho(S^*).
\end{equation*}

This is a trivial consequence of Section~\ref{subsubsec:step2}. We have already shown that there exists an iteration for which $\rho(S) \geq \frac{1}{2(1+\epsilon)} \rho(S^*)$. Since the algorithm keeps track of (and returns) the set $\tilde{S}$ with the highest observed density, it follows immediately that $\rho(\tilde{S}) \geq \frac{1}{2(1+\epsilon)} \rho(S^*)$.

\subsection{Implementation of dense subgraph search}

%I first tried to implement the streaming file model in Matlab... didn't work too well (too slow). So, I implemented the dense subgraph search algorithm in C.

\begin{figure}[t]
	\begin{center}
		\includegraphics[width=0.6\textwidth]{p2c_i_ii.pdf}
	\end{center}
	\caption{(Left column) Number of iterations required to find a single dense subgraph as a function of $\epsilon$. The observed performance is significantly better than the theoretical bound. (Right) Variation in $\rho(S_i)$, $|E(S_i)|$, and $|S_i|$ over the algorithm execution, where $i$ is the iteration of the while loop.\label{fig:singlerun}}
\end{figure} 

Fig.~\ref{fig:singlerun}(a) shows the number of iterations to find a single dense subgraph, as a function of $\epsilon=\left\{0,0.1,0.5,1,2\right\}$. The algorithm seems to use significantly fewer steps than the theoretical bounds of Section~\ref{subsec:runningtime}. Fig.~\ref{fig:singlerun}(b) shows the evolution of $\rho(S_i)$, $|E(S_i)|$, and $|S_i|$ over the algorithm execution, where $i$ denotes the iteration of the main (while) loop.

I then repeated the algorithm to reveal $20$ communities in the graph. Fig.~\ref{fig:communities} shows the density $\rho(\tilde{S}_j)$ of the $j$-th community. I present the remaining results, namely $|E(\tilde{S}_j)|$ and $|\tilde{S}_j|$, in Table~\ref{tab:communities}, since they tend to vary somewhat wildly over $j$. This owes to the fact that the algorithm orders the results by density, rather than absolute edge or vertex count.

\begin{table}[t]
\begin{center}
\begin{tabular}{|c|c|c|c|}
\hline
Index $j$ & $\rho(\tilde{S}_j)$ & $|\tilde{S}_j|$ & $|E(\tilde{S}_j)|$ \\
\hline
$0$ & $134.35$ & $1286$ & $345560$ \\
$1$ & $87.53$ & $238$ & $41666$ \\
$2$ & $52.75$ & $3989$ & $420860$ \\
$3$ & $32.59$ & $70$ & $4561$ \\
$4$ & $26.12$ & $45975$ & $2401632$ \\
$5$ & $17.62$ & $110$ & $3875$ \\
$6$ & $14.28$ & $60$ & $1714$ \\
$7$ & $10.52$ & $196$ & $4122$ \\
$8$ & $9.62$ & $21$ & $403$ \\
$9$ & $10.39$ & $23$ & $478$ \\
$10$ & $9.17$ & $812$ & $14898$ \\
$11$ & $8.28$ & $383$ & $6346$ \\
$12$ & $8.85$ & $39$ & $690$ \\
$13$ & $8.12$ & $164398$ & $2670910$ \\
$14$ & $5.46$ & $24$ & $262$ \\
$15$ & $4.50$ & $20$ & $180$ \\
$16$ & $4.30$ & $10$ & $86$ \\
$17$ & $3.86$ & $63$ & $486$ \\
$18$ & $3.37$ & $106$ & $714$ \\
$19$ & $3.03$ & $293$ & $1778$ \\
\hline
\end{tabular}
\end{center}
\caption{Properties of the top $20$ dense communities (as identified by repeated application of our dense subgraph search algorithm) in `livejournal-undirected.txt'. \label{tab:communities}}
\end{table}

\begin{figure}[t]
	\begin{center}
		\includegraphics[width=0.6\textwidth]{p2c_communities.pdf}
	\end{center}
	\caption{The density of the $j$-th community returned by repeated execution of the dense graph search algorithm. The density apparently trails off after the first $5$ or so communities. \label{fig:communities}}
\end{figure} 

\includepdf[pages=-,nup=1x2,landscape=true]{p2c.pdf}

\section{PageRank Computation}

\subsection{$L_1$ convergence\label{subsec:pagerank_conv}}

We telescope the expression
\begin{align*}
	\pi-\pi^{(k)} &= \left(\frac{\epsilon}{n}\textbf{1}^T+(1-\epsilon)\pi P\right) - \left(\frac{\epsilon}{n}\textbf{1}^T+(1-\epsilon)\pi^{(k-1)} P\right),\\
				  &= (1-\epsilon)(\pi-\pi^{(k-1)}) P,\\
				  &= (1-\epsilon)^2(\pi-\pi^{(k-2)}) P^2,\\
				  &= \cdots\\
				  &= (1-\epsilon)^k(\pi-\pi^{(0)})P^k.
\end{align*}

Recall now that the matrix $P$ is row-stochastic, which implies that $\pi P^k$ and $\pi^{(0)} P^k$ are discrete probability distribution vectors. The largest $L_1$-distance between two discrete probability vectors is $2$, which occurs when the two vectors encode unit probabilities for two distinct outcomes. Thus,
\begin{equation}
	\vectornorm{\pi-\pi^{(k)}}_1 \leq 2 \cdot (1-\epsilon)^k.
	\label{eq:pagerank_conv}
\end{equation}

\subsection{Running time of Power Iteration}

First, we can rearrange Eq.~\ref{eq:pagerank_conv} to find that, in order to bound the $L_1$ norm $\vectornorm{\pi-\pi^{(k)}}_1$ at some fixed $c<1$, we have to perform 
\begin{equation*}
	k > \frac{|\log(c/2)|}{\log(1/(1-\epsilon))}
\end{equation*}
iterations of Power Iteration. (Note that for $c<1$, the quantity $\log(c/2)$ is negative. The above equation has been configured so that both the numerator and denominator are positive.)

Next, we consider the update equation in Power Iteration:
\begin{equation*}
	\pi^{(i)} = \frac{\epsilon}{n}\textbf{1}^T + (1-\epsilon)\pi^{(i-1)}P.
\end{equation*}
This computation can be performed as follows:
\begin{enumerate}
	\item Begin with an $n$-entry array $\pi^{(i)}$, where all entries are initialized to the value $\epsilon/n$. This initialization is assumed to be free.
	\item We perform the vector-matrix multiplication $(1-\epsilon)\pi^{(i-1)}P$. In the typical PageRank context, the (extremely large) $n\times n$ matrix $P$ is sparse and has exactly $m$ nonzero entries. Hence, this vector-matrix multiplication involves exactly $m$ multiplications (and subsequent additions to the array $\pi^{(i)}$).
\end{enumerate}

Hence, the overall time complexity is $O(km) = O(\frac{m}{\log(1/(1-\epsilon))})$.

\subsection{Algorithm MC1}

\subsubsection{Expectation\label{subsubsec:expectation}}

Let us begin by finding a formal expression for the stationary PageRank distribution $\pi$. From the results of Section~\ref{subsec:pagerank_conv}, we know that Power Iteration initialized at $\pi^{(0)}$ converges to the stationary distribution. We write the evolution of $\pi^{(i)}$ explicitly:
\begin{align*}
	\pi^{(0)} &= \frac{1}{n}\textbf{1}^T\\
	\pi^{(1)} &= \epsilon\frac{1}{n}\textbf{1}^T+(1-\epsilon)\cdot\pi^{(0)}\cdot P\\
			  &= \epsilon\frac{1}{n}\textbf{1}^T+(1-\epsilon)\cdot\frac{1}{n}\textbf{1}^T\cdot P\\
	\pi^{(2)} &= \epsilon\frac{1}{n}\textbf{1}^T+(1-\epsilon)\cdot\pi^{(1)}\cdot P\\
			  &= \epsilon\frac{1}{n}\textbf{1}^T+\epsilon(1-\epsilon)\cdot\frac{1}{n}\textbf{1}^T\cdot P+(1-\epsilon)^2\cdot\frac{1}{n}\textbf{1}^T\cdot P^2
\end{align*}
and so on. By iterating this rule, we find the stationary distribution to be
\begin{equation}
	\pi = \frac{1}{n}\textbf{1}^T\cdot\left[\epsilon+\epsilon(1-\epsilon)P+\epsilon(1-\epsilon)^2 P^2+\epsilon(1-\epsilon)^3 P^3+\cdots\right].
	\label{eq:stationary_distr}
\end{equation}

Now we consider the MC1 algorithm. For a single random walker, we define the binary random variable $X_j$ where $X_j=1$ indicates that the walk terminated on node $j$. Note that the estimator $\hat{\pi}_{j,N}$ may be written in terms of $X_j$ as:
\begin{equation}
	\hat{\pi}_{j,N} = \frac{1}{N}\sum_{k=1}^N X_j^{(k)},
	\label{eq:mc1_estimator}
\end{equation}
where $k$ indexes over the different instantiations of the random walk. The variables $X_j^{(k)}$ are i.i.d.

We may compute the probability $E(X_j)$ by considering all possible paths that terminate on node $j$ of a given path length:
\begin{itemize}
	\item Path length 0: The walk is initialized at the node $j$, and terminates immediately. This occurs with probability $\epsilon\frac{1}{n}$.
	\item Path length 1: We take one step and then terminate, which gives the factor $(1-\epsilon)\epsilon$. We also need to sum over the prior probability of all nodes that can walk (in a single step) to node $j$. Hence, the overall probability is $\epsilon(1-\epsilon)(\frac{1}{n}\textbf{1}^T P)_j$, where we have made use of the fact that $P$ implements a single step of our ``random surfer'' in the graph.
\end{itemize}
Similarly, we find that paths of length $t$ have probability $\epsilon(1-\epsilon)^{t}(\frac{1}{n}\textbf{1}^T P^{t})_j$ of terminating on node $j$ when initialized uniformly over the entire graph. Summing these disjoint probabilities, we find
\begin{equation*}
	E[X_j] = \epsilon(\frac{1}{n}\textbf{1}^T)_j+\epsilon(1-\epsilon)(\frac{1}{n}\textbf{1}^T P)_j +\epsilon(1-\epsilon)^2 (\frac{1}{n}\textbf{1}^T P^2)_j+\cdots,
\end{equation*}
which is precisely the $j$-th component of the stationary distribution $\pi$ in Eq.~\ref{eq:stationary_distr}. Since the variables $X_j^{(k)}$ are i.i.d., we obtain from Eq.~\ref{eq:mc1_estimator} that $E[\hat{\pi}_{j,N}]=E[X_j^{(1)}]=\pi_j$.

\subsubsection{Variance}

The random variable $X_j$ is a Bernoulli random variable with success probability $E[X_j]=\pi_j$, as shown in the previous section. Correspondingly, the variance of $X_j$ is $\pi_j(1-\pi_j)$.

Using the i.i.d. property of $X_j^{(k)}$ in Eq.~\ref{eq:mc1_estimator}, it follows that $\mathrm{Var}(\hat{\pi}_{j,N}) = N^{-1}\pi_j(1-\pi_j)$.

\subsection{Algorithm MC2}

Similarly to before, we use the binary variable $X_{i,j}=1$ to denote that the walk -- now initialized at node $i$ -- ultimately terminated at node $j$.

The MC2 estimator $\hat{\hat{\pi}}$ may then be written
\begin{equation}
	\hat{\hat{\pi}}_{j,N} = \frac{1}{N} \sum_{k=1}^R \sum_{i=1}^n X_{i,j}^{(k)} = \frac{1}{R} \sum_{k=1}^R \frac{1}{n} \sum_{i=1}^n X_{i,j}^{(k)}.
	\label{eq:mc2_estimator}
\end{equation}

Now consider the probability $E(X_{i,j})$. This quantity can be computed by the same strategy as in Section~\ref{subsubsec:expectation}, but where the prior distribution on the graph is not $\pi^{(0)}=\frac{1}{n}\textbf{1}^T$ but the deterministic distribution $\rho_i^T = [0\;\cdots\;0\;1\;0\;\cdots\;0]$ which is nonzero only at the $i$-th node. This substitution yields:
\begin{equation*}
	E[X_{i,j}] = \epsilon(\rho_i^T)_j+\epsilon(1-\epsilon)(\rho_i^T P)_j +\epsilon(1-\epsilon)^2 (\rho_i^T P^2)_j+\cdots,
\end{equation*}
which we then utilize to take the expectation of Eq.~\ref{eq:mc2_estimator}
\begin{align*}
	E[\hat{\hat{\pi}}_{j,N}] &= \frac{1}{R} \sum_{k=1}^R \frac{1}{n} \sum_{i=1}^n E[X_{i,j}^{(k)}]\\
							&= \frac{1}{R} \sum_{k=1}^R \frac{1}{n} \sum_{i=1}^n \left[ \epsilon(\rho_i^T)_j+\epsilon(1-\epsilon)(\rho_i^T P)_j +\epsilon(1-\epsilon)^2 (\rho_i^T P^2)_j+\cdots \right]\\
							&= \frac{1}{R} \sum_{k=1}^R \frac{1}{n} \sum_{i=1}^n \left\{ \rho_i^T \cdot \left[\epsilon+\epsilon(1-\epsilon)P +\epsilon(1-\epsilon)^2 P^2 +\cdots \right] \right\}_j.
\end{align*}

Now, we recognize that the infinite series in the ``innermost-loop'' does not depend on the index $i$, and that the summation $\frac{1}{n}\sum_{i=1}^n \rho_i$ is equivalent to $\frac{1}{n}\textbf{1}^T$. This yields
\begin{align*}
	E[\hat{\hat{\pi}}_{j,N}] &= \frac{1}{R} \sum_{k=1}^R \left\{ \frac{1}{n}\textbf{1}^T\cdot\left[\epsilon+\epsilon(1-\epsilon)P +\epsilon(1-\epsilon)^2 P^2 +\cdots \right] \right\}_j = \frac{1}{R} \sum_{k=1}^R \pi_j = \pi_j.
\end{align*}

So, MC2 algorithm also provides an unbiased estimate of the stationary PageRank distribution.

\subsection{Algorithm MC3}

The analysis is similar to that of MC2. Now let $\tilde{X}_{i,j}$ denote the number of times that a walk initialized at node $i$ visits node $j$ some time during the walk. We then have
\begin{equation*}
	E[\tilde{\pi}_j] = \frac{\epsilon}{nR} \cdot \sum_{k=1}^R \sum_{i=1}^n E[\tilde{X}_{i,j}]
\end{equation*}

The quantity $E[\tilde{X}_{i,j}]$ is given by
\begin{equation*}
	E[\tilde{X}_{i,j}] = (\rho_i^T)_j+(1-\epsilon)(\rho_i^T P)_j +(1-\epsilon)^2 (\rho_i^T P^2)_j+\cdots,
\end{equation*}
where $\rho_i$ was defined in our discussion of MC2. Note that the only difference between $E[X_{i,j}]$ of MC2 and $E[\tilde{X}_{i,j}]$ of MC3 is the omission of factor $\epsilon$ in the latter, which follows since we are no longer requiring the walk to terminate at node $j$.

We then have
\begin{align*}
	E[\tilde{\pi}_j] &= \frac{\epsilon}{nR} \cdot \sum_{k=1}^R \sum_{i=1}^n E[\tilde{X}_{i,j}]\\
					&= \frac{1}{R} \sum_{k=1}^R \frac{1}{n} \sum_{i=1}^n \left[ \epsilon(\rho_i^T)_j+\epsilon(1-\epsilon)(\rho_i^T P)_j +\epsilon(1-\epsilon)^2 (\rho_i^T P^2)_j+\cdots \right],
\end{align*}
which was shown in our analysis of MC2 to be equivalent to $\pi_j$.

\subsection{Running time of MC3}

Given a termination probability $\epsilon$, the length $T$ of a single random walk is distributed geometrically as
\begin{equation*}
	p(T=t) = \epsilon(1-\epsilon)^t\qquad\text{for}\; t=0,1,2,\dots
\end{equation*}
whose expectation is $E(T)=1/\epsilon$.

Since we are instantiating $N=nR$ random walks, and each step of the walk takes a unit amount of time, the expected running time of MC3 is $O(nR/\epsilon)$.

\subsection{Numerical results}

See Table~\ref{tab:PRresults}. Indeed, for a fixed number of random walkers, MC3 provides the best performance against the true PageRank vector (estimated from Power Iteration). The comparison between MC2 and MC1, on the other hand, appears more subtle. I performed this problem in Matlab. The source code is attached.

\begin{table}[ht]
\begin{center}
\begin{tabular}{|c|c|c|c|c|c|}
\hline
Method 			   & Total elapsed time (ms) & \multicolumn{4}{|c|}{Average absolute error} \\
\hline
Power Iteration (40) & $3.1$	& Top $10$ & Top $30$ & Top $50$ & All \\
\hline
MC3 ($R=1$) & $17.4$ & $0.005$ & $0.004$ & $0.003$ & $0.003$\\
MC3 ($R=3$) & $49.0$ & $0.004$ & $0.003$ & $0.002$ & $0.002$\\
MC3 ($R=5$) & $80.0$ & $0.002$ & $0.001$ & $0.001$ & $0.001$\\
\hline
MC2 ($R=1$) & $14.9$ & $0.014$ & $0.011$ & $0.010$ & $0.008$\\
MC2 ($R=3$) & $41.2$ & $0.014$ & $0.008$ & $0.006$ & $0.005$\\
MC2 ($R=5$) & $64.6$ & $0.004$ & $0.004$ & $0.003$ & $0.003$\\
\hline
MC1 ($N=100$) & $14.5$ & $0.017$ & $0.011$ & $0.010$ & $0.008$\\
MC1 ($N=300$) & $43.0$ & $0.009$ & $0.007$ & $0.005$ & $0.004$\\
MC1 ($N=500$) & $66.8$ & $0.005$ & $0.004$ & $0.003$ & $0.003$\\
\hline
\end{tabular}
\end{center}
\caption{Comparison of the Power Iteration method for computing PageRank against Monte-Carlo methods.\label{tab:PRresults}}
\end{table}

\includepdf[pages=-,nup=2x2]{p3f.pdf}

\section{Latent Features for Recommendations}

\subsection{Update equations of stochastic descent}

The error (with regularization) is defined as
\begin{equation}
	E = \sum_{(u,i)\in\mathrm{Ratings}} (R_{iu}-q_i\cdot p_u^T)^2 + \lambda\left[\sum_u \vectornorm{p_u}^2 + \sum_i \vectornorm{q_i}^2\right]
\end{equation}
where $q_i$ is the $i$-th row of $Q$, and $p_u$ is the $u$-th row of $P$. Note: at the time of writing, there are some inconsistencies in the definitions as provided by the problem set (\emph{e.g.} is $p_u$ a row- or column-vector, $\epsilon_{ui}$ vs. $\epsilon_{iu}$, etc.). In these ambiguous cases, I will just ``do the right thing.''

\begin{itemize}
	\item The error for any pair $(i,u)$ is: $\epsilon_{iu} = R_{iu}-q_i\cdot p_u^T$.
	\item The stochastic update equation for $q_i$ is: $q_i \leftarrow q_i + \eta \cdot (\epsilon_{iu} p_u - \lambda q_i)$. (This is basically a derivative of the error $E$ \emph{with respect to a single example} with respect to $q_i$.)
	\item Likewise, the stochastic update equation for $p_u$ is: $p_u \leftarrow p_u + \eta \cdot (\epsilon_{iu} q_i - \lambda p_u)$.
\end{itemize}

\subsection{Stochastic latent features implementation}

In our dataset, there are $n=943$ users and $m=1682$ movies. Fig.~\ref{fig:learningrate} shows a number of different learning rates $\eta$ that I have tried. I find $\eta=0.01$ to be a fairly good candidate. (I've noticed that larger values for $\eta$ can often lead to divergence!)

\begin{figure}[t]
	\begin{center}
		\includegraphics[width=0.6\textwidth]{p4b_learningrate.pdf}
	\end{center}
	\caption{Decrease in the error function (with regularization) as a function of the learning rate $\eta$. For these runs, $k=20$ and $\lambda = 0.2$. It appears that $\eta=0.01$ is suitable for this particular dataset.\label{fig:learningrate}}
\end{figure} 

My Matlab implementation is attached on the following page.

\subsection{Training and test errors as function of latent factor dimension}

Fig.~\ref{fig:lfdim} shows the training and test errors as a function of latent factor dimension $k$, at different values of the regularization parameter $\lambda$.

The three valid statements are:
\begin{itemize}
	\item Regularization \textbf{decreases} the test error for $k\geq 5$.
	\item Regularization \textbf{increases} the training error for all $k$.
	\item Regularization \textbf{decreases} overfitting.
\end{itemize}

\begin{figure}[t]
	\begin{center}
		\includegraphics[width=0.9\textwidth]{p4c_overfitting.pdf}
	\end{center}
	\caption{Variation in the training and test errors as a function of latent factor dimension $k$, for different values of the regularization parameter $\lambda$. Regularization counteracts overfitting of the training set, and yields better generalization performance (\emph{i.e.} on the test set).\label{fig:lfdim}}
\end{figure} 

\includepdf[pages=-,nup=1x2,landscape=true]{p4b.pdf}

\end{document}