\documentclass{article}
\usepackage[pdftex]{graphicx}
\usepackage{amsfonts}
\usepackage{amsmath, amsthm, amssymb}
\usepackage{moreverb}
\usepackage{pdfpages}
\usepackage{multirow}

\title{CS 224w: Problem Set 3}
\author{Tony Hyun Kim}
\setlength{\parindent}{0pt}
\setlength\parskip{0.1in}
\setlength\topmargin{0in}
\setlength\headheight{0in}
\setlength\headsep{0in}
\setlength\textheight{8.2in}
\setlength\textwidth{6.5in}
\setlength\oddsidemargin{0in}
\setlength\evensidemargin{0in}

\pdfpagewidth 8.5in
\pdfpageheight 11in

% Custom commands
\newcommand{\vectornorm}[1]{\left|\left|#1\right|\right|}

\begin{document}

\maketitle

\section{Influence maximization}

\subsection{Example where $f(S_2) < f(T)$\label{subsec:p1a}}

\begin{figure}[b!]
	\begin{center}
		\includegraphics[width=0.4\textwidth]{p1a.pdf}
	\end{center}
	\caption{Example where $f(S_2) < f(T)$.\label{fig:p1a}}
\end{figure}

Consider the scenario depicted in Fig.~\ref{fig:p1a}: only the nodes $x_1$, $x_2$ and $x_3$ have nontrivial influence sets, indicated by the directed edges.

The optimal set of size $i=2$ is $T = \left\{x_1, x_3\right\}$, which has a combined influence set of $8$ elements (counting $x_1$ and $x_3$ themselves). On the other hand, the greedy algorithm will choose $x_2$ in the first round, and yield $S_2 = \left\{x_2, x_1\right\}$ in the second round, which has an influence set of $7$ elements. (Note: $x_1$ in $S_2$ could be replaced by $x_3$.)
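As a sanity check, the greedy computation can be reproduced in a few lines of Python. The specific sets below are a hypothetical concretization of Fig.~\ref{fig:p1a} (the element names are my own), chosen to be consistent with the description above:
\begin{verbatim}
# Hypothetical influence sets consistent with Fig. 1 (illustrative only).
influence = {
    'x1': {'x1', 'a1', 'a2', 'a3'},        # |X1| = 4
    'x2': {'x2', 'a1', 'a2', 'b1', 'b2'},  # |X2| = 5 (largest single set)
    'x3': {'x3', 'b1', 'b2', 'b3'},        # |X3| = 4
}

def f(S):
    """Influence score: size of the union of the chosen influence sets."""
    covered = set()
    for v in S:
        covered |= influence[v]
    return len(covered)

# Greedy hill-climbing: repeatedly add the node with largest marginal gain.
S = []
for _ in range(2):
    best = max(influence, key=lambda v: f(S + [v]) - f(S))
    S.append(best)

print(S, f(S))                        # ['x2', 'x1'], 7  (greedy S_2)
print(['x1', 'x3'], f(['x1', 'x3']))  # ['x1', 'x3'], 8  (the optimum T)
\end{verbatim}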

\subsection{Example where $f(S_3) \leq 0.8 f(T)$}

Consider the scenario depicted in Fig.~\ref{fig:p1b} (a generalized form of the case in Section~\ref{subsec:p1a}). Each of the nodes $x_1$, $x_2$, $x_3$ has an influence set of $n+1$ elements (counting itself), whereas node $x_4$ has an influence set of $3k+1$ elements.

\begin{figure}[t]
	\begin{center}
		\includegraphics[width=0.75\textwidth]{p1b.pdf}
	\end{center}
	\caption{Example where $f(S_3) < 0.8 \cdot f(T)$.\label{fig:p1b}}
\end{figure}

Suppose that $3k > n$. In this case, the first step of the greedy algorithm will choose $x_4$, since $|X_4| = 3k+1 > |X_1| = |X_2| = |X_3| = n+1$. In the next two steps, the greedy algorithm will choose two of $\left\{x_1, x_2, x_3\right\}$ (arbitrarily, by symmetry). Since $X_4$ overlaps each $X_i$ in $k$ nodes, each of these steps contributes $n+1-k$ new nodes, so the resulting influence score of the greedy algorithm is $f(S_3) = (3k+1) + 2(n+1-k) = 2n + k + 3$.

Consider the set $T = \left\{x_1, x_2, x_3\right\}$. Its influence score is $f(T) = 3n + 3$, and $T$ is optimal as long as $k < n$ (any set containing $x_4$ scores at most $2n+k+3 < 3n+3$). So, in the case of Fig.~\ref{fig:p1b}, greedy obtains a suboptimal solution whenever $k < n < 3k$.

Next, we choose $n$, $k$ such that $f(S_3) \leq 0.8 \cdot f(T)$. The inequality can be stated as follows:
\begin{equation}
	0.8 \geq \frac{f(S_3)}{f(T)} = \frac{2n+k+3}{3n+3},
\end{equation}
which, after a little manipulation, yields:
\begin{equation}
	n \geq \frac{5k+3}{2}\label{eqn:p2b_ineq}.
\end{equation}

The above inequality is readily satisfied. For example, let $k=10$ and $n=27$. Since $n<3k$, the greedy solution is suboptimal as described above. Furthermore, Eq.~\ref{eqn:p2b_ineq} becomes $27 \geq 26.5$, and indeed $f(S_3) = 2(27) + 10 + 3 = 67 \leq 0.8 \cdot f(T) = 0.8 \left(3 \cdot 27 + 3\right) = 67.2$.

\subsection{Sufficient property for $f(S_i) = f(T)$\label{subsec:optgreedy}}

A trivial sufficient condition for $f(S_i) = f(T)$ is that the $i$ largest influence sets be disjoint. In this case, we do not have to worry about possible ``interactions'' (overlaps) between a node under consideration and the nodes already in our solution set, so the optimal set can be constructed by considering nodes in isolation, exactly as the greedy algorithm does.

\subsection{Bad data-dependent bound}

Here is a pathological example where the data-dependent bound performs arbitrarily badly. The intuition is that the data-dependent bound relies on the property of diminishing returns; so, I construct an example where diminishing returns are absent.

Consider a case where the influence sets of the nodes $x_i$ are disjoint, as depicted in Fig.~\ref{fig:p1d}. Each node $x_i$ has an influence set $X_i$ of size $L + 1$. (The downstream nodes in the influence set, \emph{i.e.} $X_i - \left\{x_i\right\}$, have trivial influence sets. Furthermore, the influence sets of distinct nodes $x_i$ are disjoint, \emph{i.e.} $X_i \cap X_j = \emptyset$ for $i \neq j$.) Assume $|\left\{x_i\right\}| \gg k$.

\begin{figure}[t]
	\begin{center}
		\includegraphics[width=0.3\textwidth]{p1d.pdf}
	\end{center}
	\caption{A possible influence set structure that results in a bad data-dependent bound.\label{fig:p1d}}
\end{figure}

Consider the slack of the data-dependent bound, $f(S_k) + \sum_{i=1}^k \delta_i - f(T)$, for some fixed $k$, where the $\delta_i$ are the $k$ largest marginal gains over $S_k$. Given disjoint influence sets, $S_k$ is some $k$-element subset of $\left\{x_i\right\}$ and is, in fact, optimal, so that $f(S_k) = f(T)$ and:
\begin{equation}
	f(S_k) + \sum_{i=1}^k \delta_i - f(T) = \sum_{i=1}^k \delta_i.
\end{equation}

Now, since we assume that $|\left\{x_i\right\}| \gg k$, there exist at least $k$ elements of $\left\{x_i\right\}$ that are not in $S_k$. Since the influence sets of the $x_i$ are disjoint, each such element has marginal gain $L+1$, and we have:
\begin{equation}
	f(S_k) + \sum_{i=1}^k \delta_i - f(T) = \sum_{i=1}^k \delta_i = k\cdot(L+1),
\end{equation}
which increases with $k$ without bound.

\section{Empirical power laws}

\subsection{Complementary CDF}

First, we find the CDF $C(x)$:
\begin{eqnarray}
	C(x) &=& \int_{x_\mathrm{min}}^x P(x') dx' = \int_{x_\mathrm{min}}^x \frac{\alpha-1}{x_\mathrm{min}}\left(\frac{x'}{x_\mathrm{min}}\right)^{-\alpha} dx',\\
		   &=&  1 - \left(\frac{x}{x_\mathrm{min}}\right)^{-(\alpha-1)},
\end{eqnarray}
when $x \geq x_\mathrm{min}$; otherwise $C(x) = 0$. (We assume $\alpha \neq 1$: if $\alpha = 1$, the stated pdf is identically zero for all $x$.)

The complementary CDF (CCDF) is given by:
\begin{equation}
	\bar{C}(x) = 1 - C(x) = \left(\frac{x}{x_\mathrm{min}}\right)^{-(\alpha-1)},
\end{equation}
when $x \geq x_\mathrm{min}$, otherwise $\bar{C}(x) = 1$.

\begin{figure}[t]
	\begin{center}
		\includegraphics[width=0.75\textwidth]{p2b.pdf}
	\end{center}
	\caption{Simulation (blue dots) of the probability distribution $P(x) = \frac{\alpha-1}{x_\mathrm{min}}\left(\frac{x}{x_\mathrm{min}}\right)^{-\alpha}$ (red dashed line).\label{fig:p2b}}
\end{figure}

\subsection{Simulated dataset with $\alpha=2$ and $x_\mathrm{min}=1$}

Since we have the explicit form of the CDF, we can generate samples from $P(x)$ by inverse transform sampling: the random variable $X = C^{-1}(U)$ is distributed according to $P(x)$, where $C^{-1}$ is the inverse CDF and $U$ is a uniform random variable on $[0,1]$. In our case, solving $C(x) = u$ gives
\begin{equation}
	C^{-1}(u) = x_\mathrm{min}\left(1-u\right)^{-1/(\alpha-1)}.
\end{equation}

Using this procedure, I simulated $10^5$ draws from $P(x)$, as shown in Fig.~\ref{fig:p2b}. I omitted the points $x$ for which the estimate of $P(X=x)$ is zero, since including such points would distort the vertical scale of the log-log plot.
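For reference, here is a minimal Python sketch of the sampling procedure (the variable names are my own; \texttt{numpy} is assumed):
\begin{verbatim}
import numpy as np

alpha, x_min, n = 2.0, 1.0, 10**5

# Inverse transform sampling: X = C^{-1}(U), with U uniform on [0, 1).
u = np.random.rand(n)
x = x_min * (1.0 - u) ** (-1.0 / (alpha - 1.0))

# Empirical estimate of P(X = x) on integer bins; bins with zero
# counts never appear in (vals, counts), so they are omitted for free.
vals, counts = np.unique(np.floor(x).astype(np.int64), return_counts=True)
p_hat = counts / n
\end{verbatim}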

\subsection{Least squares fit for $\alpha$ on PDF\label{subsec:fithist}}

When I use all (nonzero) histogram data to find the exponent, the least squares slope is $\approx -1.1$, \emph{i.e.} $\hat{\alpha}_\mathrm{hist} \approx 1.1$. However, as shown in Fig.~\ref{fig:p2c}(a), this is a terrible fit to the data: the least squares routine is fooled by the noisy data points in the tail.

We can get a better fit by omitting the noisy heavy-tail portion of the simulation ($x > 10^2$, an arbitrary cutoff) from the least squares fit. The result is shown in Fig.~\ref{fig:p2c}(b): the slope is $-1.95$, \emph{i.e.} $\hat{\alpha}_\mathrm{hist} \approx 1.95$, much closer to the simulation parameter.
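A minimal sketch of this fit, reusing the \texttt{vals} and \texttt{p\_hat} arrays from the sampling snippet above:
\begin{verbatim}
import numpy as np

# Least squares line through the log-log histogram; the slope estimates
# -alpha. The mask omits the noisy tail (arbitrary cutoff at x = 10^2).
mask = vals <= 100
slope, intercept = np.polyfit(np.log10(vals[mask]), np.log10(p_hat[mask]), 1)
alpha_hist = -slope
\end{verbatim}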

\begin{figure}[t]
	\begin{center}
		\includegraphics[width=0.75\textwidth]{p2c.pdf}
	\end{center}
	\caption{Least squares fits to the simulated distribution of Fig.~\ref{fig:p2b}: (a) fit using all nonzero histogram points; (b) fit omitting the noisy tail ($x > 10^2$).\label{fig:p2c}}
\end{figure}

\subsection{Least squares fit for $\alpha$ on CCDF\label{subsec:fitccdf}}

Based on the log-log fit to the simulated CCDF, I find $\hat{\alpha}_\mathrm{CCDF} = 1.999$. (The CCDF falls off as $x^{-(\alpha-1)}$, so the fitted slope estimates $-(\hat{\alpha}-1)$.)
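A sketch of this fit, again reusing the samples \texttt{x} from above:
\begin{verbatim}
import numpy as np

# Empirical CCDF: after sorting, a fraction (n - i)/n of the samples
# exceeds the i-th smallest value.
xs = np.sort(x)
ccdf = 1.0 - np.arange(1, n + 1) / n

# Drop the final point (CCDF = 0) before taking logs; the slope
# estimates -(alpha - 1).
slope, _ = np.polyfit(np.log10(xs[:-1]), np.log10(ccdf[:-1]), 1)
alpha_ccdf = 1.0 - slope
\end{verbatim}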

\subsection{MLE estimate of $\alpha$\label{subsec:mle}}

The MLE estimate of $\alpha$ is given by:
\begin{equation}
	\hat{\alpha}_\mathrm{MLE} = 1 + n\left[\sum_{i=1}^n\log\left(\frac{d_i}{x_\mathrm{min}}\right)\right]^{-1}
	\label{eqn:mlealpha}
\end{equation}
which, when applied to my dataset, yielded $\hat{\alpha}_\mathrm{MLE} = 2.002$.
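In code, Eq.~\ref{eqn:mlealpha} is a one-liner (applied to the samples \texttt{x} from above):
\begin{verbatim}
import numpy as np

# Closed-form MLE of the power-law exponent; no curve fitting involved.
alpha_mle = 1.0 + n / np.sum(np.log(x / x_min))
\end{verbatim}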

\subsection{Comparison of estimation methods}

I ran the simulation and fitting procedures $100$ times, with the following results (the quoted uncertainty is the standard deviation, rather than the variance):
\begin{itemize}
	\item Fitting the histogram (Section~\ref{subsec:fithist}, with no arbitrary cutoff): $\hat{\alpha} \approx 1.0884 \pm 0.0287$.
	\item Fitting the CCDF (Section~\ref{subsec:fitccdf}): $\hat{\alpha} \approx 1.9902 \pm 0.0296$.
	\item Computing the MLE (Section~\ref{subsec:mle}): $\hat{\alpha} \approx 1.9994 \pm 0.0028$.
\end{itemize}

Clearly, fitting the entire histogram is highly susceptible to the noise in the long tail (Fig.~\ref{fig:p2c}), and yields a wildly inaccurate result.

On the other hand, the means of the CCDF and MLE methods both yield good estimates of the true $\alpha$. However, the MLE is slightly more accurate and more consistent (smaller standard deviation). The performance of the MLE is intuitively sensible: the MLE presumes the correct model of the distribution, whereas the least squares fits are ``not aware'' of the underlying distribution model.

\includepdf[pages=-,nup=1x2,landscape=true,]{p2b_code.pdf}

\section{Combination of exponentials for generating power laws}

I wrote a simple Python program to generate the word counts of the two source files. I then used Matlab to plot and analyze the data.

\subsection{Empirical power laws in word-frequency distributions}

\begin{figure}[t!]
	\begin{center}
		\includegraphics[width=0.6\textwidth]{p3a.pdf}
	\end{center}
	\caption{Word frequency distribution from Moby Dick and Don Quijote sources. The MLE fit (red dashed line) uses the formulas from the continuous power-law distribution.\label{fig:p3a}}
\end{figure}

The empirical distributions are shown in Fig.~\ref{fig:p3a}. I also plotted the MLE power-law distribution (dashed lines) assuming $x_\mathrm{min}=1$, since words with this minimum frequency are present in the empirical distribution. Note that the MLE estimate of $\alpha$ (Eq.~\ref{eqn:mlealpha}) was derived for a \emph{continuous} power-law distribution, whereas we are dealing with a discrete distribution here. This accounts for the relatively poor performance of the MLE fit in describing the data.

In Fig.~\ref{fig:p3a_discrete}, on the other hand, I show the results of the MLE fit assuming a \emph{discrete} power-law distribution. I continued to use $x_\mathrm{min} = 1$, since such frequencies arise in the real data, and I numerically calculated the necessary normalization coefficient at each value of $\alpha$. It can be readily observed that the MLE fit assuming a discrete model performs significantly better than the one based on the continuous model.
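A minimal sketch of this procedure: a grid search over $\alpha$, with the discrete normalization computed via the Hurwitz zeta function in \texttt{scipy} (the input file name is hypothetical; \texttt{d} holds one frequency per distinct word):
\begin{verbatim}
import numpy as np
from scipy.special import zeta

def discrete_loglik(alpha, d, x_min=1):
    # Log-likelihood under P(x) = x^(-alpha) / zeta(alpha, x_min),
    # the discrete power law on integers x >= x_min.
    return -alpha * np.sum(np.log(d)) - d.size * np.log(zeta(alpha, x_min))

d = np.loadtxt('word_counts.txt')   # hypothetical input file

# Grid search for the MLE exponent (the zeta sum requires alpha > 1).
alphas = np.linspace(1.05, 4.0, 600)
alpha_hat = alphas[np.argmax([discrete_loglik(a, d) for a in alphas])]
\end{verbatim}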

The fitted parameters (using the discrete model) are:
\begin{itemize}
	\item Moby Dick: $x_\mathrm{min}=1$, and $\hat\alpha = 1.74$.
	\item Don Quijote: $x_\mathrm{min}=1$, and $\hat\alpha = 1.69$.
\end{itemize}

\begin{figure}[p!]
	\begin{center}
		\includegraphics[width=0.65\textwidth]{p3a_discrete.pdf}
	\end{center}
	\caption{Word frequency distribution from Moby Dick and Don Quijote sources. The parameter $x_\mathrm{min}$ was fixed at $1$, while the MLE curve was computed as a function of $\alpha$ with the normalization constant appropriate for a discrete power-law distribution. It can be seen that the MLE fit on the discrete model (black curve) performs better than the one based on the continuous model (red curve).\label{fig:p3a_discrete}}
\end{figure}

\includepdf[pages=-,nup=1x2,landscape=true,]{p3a_code.pdf}

\subsection{Theoretical power laws in monkey novels}

\subsubsection{Distinct words of length $y$}

Given $m$ letters, there are $m^y = e^{y\log m}$ distinct words of length $y$ (each of the $y$ letters is chosen independently from the $m$ possibilities).

\subsubsection{Relative frequency of a particular word of length $y$}

Given a particular word of length $y$, the monkey must type that specific $y$-letter sequence (each letter occurring with probability $(1-q_s)/m$) and then terminate the word with a space (probability $q_s$). So, the relative frequency $x$ is given by:
\begin{equation}
	x = \left(\frac{1-q_s}{m}\right)^y \cdot q_s = q_s \cdot e^{\left(\log\frac{1-q_s}{m}\right) y}.
\end{equation}

\subsubsection{Probability of a word having relative frequency $x$}
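Combining the two results above yields the power law; the following is a sketch of the standard combination-of-exponentials argument, treating $y$ as continuous. The number of distinct words of length $y$ grows as $e^{y\log m}$, while the relative frequency decays exponentially, $x(y) = q_s e^{-\beta y}$ with $\beta = \log\frac{m}{1-q_s}$. Inverting gives $y = \frac{1}{\beta}\log\frac{q_s}{x}$ and $\left|\frac{dy}{dx}\right| = \frac{1}{\beta x}$, so the density of words over relative frequency is
\begin{equation}
	P(x) \propto e^{y \log m}\left|\frac{dy}{dx}\right| \propto x^{-\alpha},
	\qquad \alpha = 1 + \frac{\log m}{\log m - \log(1-q_s)},
\end{equation}
\emph{i.e.} a power law in $x$ arises from the combination of the two exponentials in $y$.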

\section{Exploring robustness of networks}

Initially, I considered using \texttt{SNAP.py} for generating the preferential attachment (PA) graph. However, since \texttt{TUNGraph} does not have a method to return a random edge of the graph, I would have had to maintain a separate edge list outside the graph object, which just does not feel right. So, I used Matlab to generate the PA graph instead.
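For concreteness, here is a minimal Python sketch of the edge-list trick (plain Python, not \texttt{SNAP.py}): each node appears in the endpoint list once per incident edge, so sampling a uniformly random endpoint selects nodes with probability proportional to degree.
\begin{verbatim}
import random

def pa_graph(n, m=2):
    # Seed: a small complete graph on m + 1 nodes.
    edges = [(i, j) for i in range(m + 1) for j in range(i)]
    # Flat endpoint list; a uniform draw from it is degree-proportional.
    endpoints = [v for e in edges for v in e]
    for new in range(m + 1, n):
        targets = set()
        while len(targets) < m:   # m distinct degree-proportional targets
            targets.add(random.choice(endpoints))
        for t in targets:
            edges.append((new, t))
            endpoints.extend((new, t))
    return edges
\end{verbatim}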

\subsection{Diameter robustness}

Fig.~\ref{fig:p4a} shows the diameter as a function of the fraction of nodes removed under the failure (triangle) and attack (circle) deletion policies. Note that the graph diameter initially increases as nodes are removed (because potential paths between nodes are eliminated), but decreases again once a large number of nodes has been removed (as the graph becomes fragmented).
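For reference, here is a sketch of the two deletion policies in Python, written against \texttt{networkx} rather than my original Matlab code (an illustrative sketch; this version ranks degrees once, whereas the attack policy could also recompute degrees after each removal):
\begin{verbatim}
import random
import networkx as nx

def remove_nodes(G, frac, policy='failure'):
    # Remove a fraction frac of nodes under the 'failure' (random) or
    # 'attack' (highest-degree-first) policy.
    G = G.copy()
    k = int(frac * G.number_of_nodes())
    if policy == 'attack':
        victims = sorted(G.nodes, key=lambda v: G.degree[v], reverse=True)[:k]
    else:
        victims = random.sample(list(G.nodes), k)
    G.remove_nodes_from(victims)
    return G

def lcc_diameter(G):
    # Diameter of the largest connected component.
    lcc = max(nx.connected_components(G), key=len)
    return nx.diameter(G.subgraph(lcc))
\end{verbatim}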

Analysis of failure vs. attack scenarios for each network:
\begin{itemize}
	\item $G_{nm}$ graph: In Fig.~\ref{fig:p4a}(a), it can be seen that the variation in graph diameter is essentially identical under the failure and attack scenarios. This makes sense since, in the random graph, the nodes are mostly homogeneous (\emph{i.e.} they possess roughly the same degree). In Fig.~\ref{fig:p4a}(b), increasing the failure fraction is shown to slowly increase the diameter of the graph.
	\item AS graph: In contrast to $G_{nm}$, the AS graph is highly susceptible to the attack scenario. In Fig.~\ref{fig:p4a}(a), it can be seen that the diameter reaches its maximum when only $2\%$ of the nodes are attacked (afterwards, the AS graph becomes fragmented). On the other hand, the AS graph has excellent random failure tolerance. In Fig.~\ref{fig:p4a}(b), it is shown that the diameter of the AS graph remains constant (at about $10$) even when $50\%$ of the graph is randomly removed!
	\item PA graph: Like the AS graph, the PA graph is susceptible to the attack policy. On the other hand, the PA graph does not appear as inhomogeneous as the AS graph: $10\%$ of the nodes need to be attacked in order to fragment the PA graph (compared to only $2\%$ for the AS graph).
\end{itemize}

Comparing the behavior of the different networks under the two deletion policies, we can conceptually rank the three graphs along a homogeneity $\leftrightarrow$ inhomogeneity axis:
\begin{itemize}
	\item The random graph $G_{nm}$ is the most homogeneous, and is relatively robust against targeted attacks. On the other hand, the diameter slowly suffers as more and more nodes fail (Fig.~\ref{fig:p4a}(b)). 
	\item Based on the results of the attack simulation, the AS graph is the most inhomogeneous; that is, the AS graph has a few nodes that are crucially important for the connectivity. The AS graph is highly susceptible to attack (on the few important nodes) but very resilient against random failures (since most nodes do not contribute significantly to the connectivity of the graph).
	\item The PA graph is intermediate between the $G_{nm}$ and AS graphs.
\end{itemize}

\begin{figure}[p]
	\begin{center}
		\includegraphics[width=0.7\textwidth]{p4a.pdf}
	\end{center}
	\caption{The graph diameter as a function of the number of nodes removed. Two deletion policies are considered: ``Attack''-mode (circles) where nodes with highest degree are targeted first; ``Failure''-mode (triangles) where nodes are removed at random. [a] The variation in graph diameter in the initial $2\%$ removal. [b] The variation in graph diameter as up to $50\%$ of the nodes are removed.\label{fig:p4a}}
\end{figure}

\subsection{$|\mathrm{LCC}|$ robustness}

Fig.~\ref{fig:p4b} shows the size of the largest connected component ($|\mathrm{LCC}|$) as a function of fraction of nodes removed under failure (triangle) and attack (circle) deletion policies.

For each graph, the attack policy is more successful at decreasing the $|\mathrm{LCC}|$ than the failure policy. This makes intuitive sense, since nodes with large degree are likely to be part of the LCC.

Under the attack policy, it is seen that the $|\mathrm{LCC}|$ of the AS graph is diminished first, then the PA graph, and then the $G_{nm}$ graph. This ordering is consistent with the idea that the AS graph is the most inhomogeneous network of the three. In an inhomogeneous network, it is relatively easy to fragment the LCC by targeting the nodes with high degrees.

Interestingly, even under the random failure policy, the $G_{nm}$ graph appears to be the most robust according to the $|\mathrm{LCC}|$ metric. This is contrary to the diameter measure of Fig.~\ref{fig:p4a}(b) where the diameter of the AS graph is robust against random failures. This suggests that it is more difficult to disconnect nodes from the LCC in a homogeneous graph than in an inhomogeneous graph.

\begin{figure}[p]
	\begin{center}
		\includegraphics[width=0.9\textwidth]{p4b.pdf}
	\end{center}
	\caption{The size of the largest connected component as a function of the number of nodes removed. Two deletion policies are considered: ``Attack''-mode (circles) where nodes with highest degree are targeted first; ``Failure''-mode (triangles) where nodes are removed at random.\label{fig:p4b}}
\end{figure}

\includepdf[pages=-,nup=1x2,landscape=true,]{p4_gen.pdf}
\includepdf[pages=-,]{p4.pdf}
\includepdf[pages=-,]{p4_matlab.pdf}

\end{document}