\documentclass[11pt]{article}

% \def\showanswer{1}

\usepackage{exercises}
\usepackage[numbers]{natbib}
\usepackage{xr}

\usebiggeometry
\bluehyperref

\newcommand{\stepsize}{\alpha}

\begin{document}

\setcounter{section}{10}

\title{ee364m Exercise Set \thesection\
  \ifdefined\showanswer
  Solutions
  \fi
}

\author{Due date: March 20, 11:59pm}
\date{}
\maketitle

In this exercise set, we will explore a few consequences of the model-based
minimization methods we have developed. This problem set is \emph{completely
optional}, but it provides what I think are some interesting additional
developments of the lectures' content, including showing how stationarity
relates to gradient mappings, even in problems beyond convexity.

\begin{question}[Models in convex optimization]
  Consider the minimization problem
  \begin{equation}
    \label{eqn:sum-objective}
    \begin{array}{ll} \minimize & f(x) \defeq h(x) + r(x) \\
      \subjectto & x \in \Omega
    \end{array}
  \end{equation}
  where $\Omega \subset \R^n$ is closed and convex, $h$ is convex with
  $L$-Lipschitz gradient on $\Omega$, and $r$ is convex.
  \begin{enumerate}[(a)]
  \item Show that the \emph{prox-linear} model
    \begin{equation*}
      f_x(y) \defeq h(x) + \<\nabla h(x), y - x\>
      + r(y)
    \end{equation*}
    is quadratically accurate for $f$, that is,
    $|f_x(y) - f(y)| \le \frac{L}{2} \ltwo{x - y}^2$ for all $y$, and
    that moreover $f_x(y) \le f(y)$ for all $y$.
  \item \label{item:one-step-prox-linear}
    Let $x\opt \in \Omega$ solve
    problem~\eqref{eqn:sum-objective}. Show that for
    an appropriate stepsize $\stepsize > 0$, the iteration
    \begin{equation*}
      x_{k + 1} \defeq \argmin_{x \in \Omega} \left\{f_{x_k}(x) +
      \frac{1}{2 \stepsize} \ltwo{x - x_k}^2 \right\}
    \end{equation*}
    satisfies
    \begin{equation*}
      f(x_{k + 1}) - f(x\opt) \le \frac{L \ltwo{x_1 - x\opt}^2}{2 k}.
    \end{equation*}
  \item Show how to solve the one-step update in
    part~\eqref{item:one-step-prox-linear} for
    $r(x) = \lambda \lone{x}$, where $\Omega = \R^n$. (This method is
    called \emph{iterative shrinkage and thresholding}, or \emph{ISTA}.)
  \item Let $\Omega$ be the collection of positive semidefinite
    matrices, and for $X \in \Omega$ let $r(X) = \lambda \tr X$ be $\lambda$
    times the trace of $X$. Show how to solve the one-step update in
    part~\eqref{item:one-step-prox-linear}.
  \end{enumerate}
\end{question}
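As a quick sanity check on the update in
part~\eqref{item:one-step-prox-linear} of the preceding question (this
remark is not itself part of the exercise), consider the special case
$r \equiv 0$ and $\Omega = \R^n$. The one-step subproblem is then an
unconstrained convex quadratic in $x$, so
\begin{equation*}
  x_{k + 1} = \argmin_{x \in \R^n} \left\{h(x_k) + \<\nabla h(x_k), x - x_k\>
  + \frac{1}{2 \stepsize} \ltwo{x - x_k}^2 \right\}
  = x_k - \stepsize \nabla h(x_k),
\end{equation*}
and the prox-linear iteration recovers gradient descent; the guarantee in
part~\eqref{item:one-step-prox-linear} then reduces to the familiar
$O(1/k)$ convergence rate of gradient descent on smooth convex functions.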


\begin{question}[A variational principle]
  Ekeland's variational principle relates near-optimality of a point to the
  existence of a nearby (nearly) stationary point, in ways that
  Question~\ref{question:gradient-maps} explores.
  \begin{theorem}[Ekeland~\cite{Ekeland74}]
    \label{theorem:ekeland}
    Let $f : \R^n \to \R \cup \{\infty\}$ be closed, proper, and bounded
    below (so that $\inf_x f(x) > -\infty$), and let
    $x_0$ satisfy $f(x_0) - \inf_x f(x) \le \epsilon$ for some $0 \le
    \epsilon < \infty$. Then for any $\delta > 0$ there exists a point
    $\what{x}$ satisfying
    \begin{enumerate}[i.]
    \item $\ltwos{\what{x} - x_0} \le \frac{\epsilon}{\delta}$
    \item $f(\what{x}) \le f(x_0)$
    \item $\what{x}$ uniquely minimizes
      $f(x) + \delta \ltwo{x - \what{x}}$ over $x \in \R^n$.
    \end{enumerate}
  \end{theorem}
  \begin{enumerate}[(a)]
  \item Define the sublevel-like set
    \begin{equation*}
      S \defeq \left\{x \in \R^n \mid f(x) + \delta \ltwos{x - x_0} \le f(x_0)
      \right\}.
    \end{equation*}
    Argue that $S$ is non-empty and compact, and hence that
    a minimizer $\what{x} \in S$ of $f$ over $S$ exists.
  \item Show that
    \begin{equation*}
      \ltwos{\what{x} - x_0} \le \frac{\epsilon}{\delta}
      ~~ \mbox{and} ~~
      f(\what{x}) \le f(x_0).
    \end{equation*}
  \item Complete the proof of the theorem by showing that
    $\what{x}$ as above satisfies its conclusions.
  \item Give an interpretation of Theorem~\ref{theorem:ekeland}.
    (There is no uniquely correct answer for this.)
  \end{enumerate}
\end{question}

\newcommand{\gradmap}{\mathsf{G}_\stepsize}
\newcommand{\frechet}{\partial^{\textsc{f}}}

For the final problem, we require a few new definitions.
A function $f : \R^n \to \R$ is $\lambda$-\emph{weakly convex} if
for some $x_0 \in \R^n$, the function
\begin{equation*}
  f(x) + \frac{\lambda}{2} \ltwo{x - x_0}^2
\end{equation*}
is convex in $x$. (Convince yourself that the choice of $x_0$ is
immaterial here.) The Fr\'{e}chet subdifferential $\frechet f(x)$ of such
a function consists of those vectors $g$ for which
\begin{equation*}
  f(y) \ge f(x) + \<g, y - x\> - o(\norm{y - x})
\end{equation*}
as $y \to x$, which is equivalent in the $\lambda$-weakly convex case
(this is not completely trivial)
to the set of vectors
\begin{equation*}
  g \in \partial_z \left.\left\{f(z) + \frac{\lambda}{2}
  \ltwo{z - x}^2\right\}\right|_{z = x},
\end{equation*}
that is, the regular subgradient of the convex function $z \mapsto f(z) +
\frac{\lambda}{2} \ltwo{z - x}^2$ evaluated at $z = x$.  Note that if the
function $f$ in Ekeland's variational principle
(Theorem~\ref{theorem:ekeland}) is weakly convex, then the final part of
the theorem implies that
\begin{equation*}
  0 \in \frechet f(\what{x}) + \delta \ball_2^n,
\end{equation*}
because $\what{x}$ minimizes $x \mapsto f(x) + \delta \ltwo{x - \what{x}}$
and the subdifferential of the norm term $x \mapsto \delta \ltwo{x -
  \what{x}}$ at $x = \what{x}$ is $\delta \ball_2^n$.
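
As a brief example of these definitions (not needed for the problems
below), consider $f(x) = |x^2 - 1|$ on $\R$. Taking $x_0 = 0$ in the
definition above, we have
\begin{equation*}
  f(x) + \frac{2}{2} x^2 = |x^2 - 1| + x^2 = \max\{2 x^2 - 1, 1\},
\end{equation*}
a maximum of convex functions and hence convex, so $f$ is $2$-weakly convex
even though it is not convex. At the kink $x = 1$, we have
$f(y) = |y - 1| \cdot |y + 1| = 2 |y - 1| + O(|y - 1|^2)$ as $y \to 1$,
so the Fr\'{e}chet subdifferential there is the interval
$\frechet f(1) = [-2, 2]$.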

\begin{question}[Gradient mappings and weakly convex minimization]
  \label{question:gradient-maps}
  \begin{enumerate}[(a)]
  \item Give an example of a convex function $f : \R \to \R$ for which
    $|f'(x)| \ge 1$ at every $x$ other than the minimizer of $f$, so that
    even in the convex case, we would not (generally) expect algorithms to
    find points with small subgradients.
  \item Let $f(x) = h(c(x))$, where $h$ is convex and $L_h$-Lipschitz
    and $c$ has $L_c$-Lipschitz derivative. Show that
    $f$ is $L_c L_h$-weakly convex, and even more, that
    \begin{equation*}
      \frechet f(x) \supset \nabla c(x) \partial h(c(x)).
    \end{equation*}
    \emph{Hint.} It is enough to show that $f +
    \frac{L_c L_h}{2} \ltwo{\cdot}^2$ has a subgradient, in the global
    (convex) sense, at every point.
  \item Recall that for a model $f_{x_0}$ of $f$ at the point $x_0$,
    we define the gradient mapping $\gradmap$ via
    \begin{equation*}
      x_\stepsize = \argmin_{x \in \R^n}
      \left\{f_{x_0}(x) + \frac{1}{2 \stepsize} \ltwo{x - x_0}^2
      \right\}
      ~~ \mbox{and} ~~
      \gradmap(x_0) \defeq \frac{1}{\stepsize}
      (x_0 - x_\stepsize).
    \end{equation*}
    (A familiar special case of this mapping is worked out just after this
    question.)
    Let each model $f_x$ be convex and quadratically accurate, so that
    $|f_x(y) - f(y)| \le \frac{M}{2} \ltwo{x - y}^2$ for all $x$ and $y$.
    Use Ekeland's variational principle (Theorem~\ref{theorem:ekeland})
    to show that for any $x_0$ and $\stepsize > 0$,
    there exists $\what{x}$ satisfying
    \begin{enumerate}[i.]
    \item Point proximity: $\ltwo{\what{x} - x_\stepsize}
      \le \stepsize \ltwo{\gradmap(x_0)}$.
    \item Value proximity: $f(\what{x}) \le f(x_\stepsize)
      + \frac{M \stepsize^2 + \stepsize}{2} \ltwo{\gradmap(x_0)}^2$.
    \item Near stationarity: $\mbox{dist}(0, \frechet f(\what{x}))
      \le (2 M \stepsize + 1) \ltwo{\gradmap(x_0)}$.
    \end{enumerate}
    In short, once the gradient mapping $\gradmap(x_0)$ is small,
    there is a point $\what{x}$ near the updated point $x_\stepsize$
    that is nearly stationary for $f$.

    \emph{Hint.} Following \citet{DrusvyatskiyLe18}, define the function
    $\varphi(y) \defeq f(y) + \frac{M + \stepsize^{-1}}{2} \ltwo{y - x_0}^2$.
    Argue that $\varphi(x_\stepsize) - \varphi\opt \le M \ltwo{x_\stepsize
      - x_0}^2 = M \stepsize^2 \ltwo{\gradmap(x_0)}^2$, where
    $\varphi\opt = \inf_y \varphi(y)$.
    Then apply Ekeland's variational principle to $\varphi$ around the
    point $x_\stepsize$, with
    $\epsilon = M \stepsize^2 \ltwo{\gradmap(x_0)}^2$ and
    $\delta = M \stepsize \ltwo{\gradmap(x_0)}$,
    and observe that the $\what{x}$ it guarantees satisfies
    $0 \in \frechet \varphi(\what{x}) + \delta \ball_2^n$,
    where $\frechet \varphi(x) = \frechet f(x) + (M + \stepsize^{-1})
    (x - x_0)$.
  \end{enumerate}
\end{question}
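
As a brief check on the gradient-mapping definition in part~(c) of the
preceding question (the special case promised there): if $f$ is
differentiable and we take the simple linear model
$f_{x_0}(x) = f(x_0) + \<\nabla f(x_0), x - x_0\>$, then setting the
gradient of the subproblem to zero gives
$\nabla f(x_0) + \frac{1}{\stepsize}(x_\stepsize - x_0) = 0$, so that
\begin{equation*}
  x_\stepsize = x_0 - \stepsize \nabla f(x_0)
  ~~ \mbox{and} ~~
  \gradmap(x_0) = \frac{1}{\stepsize}(x_0 - x_\stepsize) = \nabla f(x_0).
\end{equation*}
That is, for the linear model the gradient mapping is simply the gradient,
which is why $\ltwo{\gradmap(x_0)}$ serves as a surrogate for approximate
stationarity.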

\bibliography{bib364m}
\bibliographystyle{abbrvnat}


\end{document}
