\documentclass{article}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{csquotes}
\usepackage{dsfont}
\usepackage{thmtools}
\usepackage{etoolbox}
\usepackage{enumitem}
\usepackage{fullpage}
\usepackage{parskip}
\usepackage{xcolor}
\usepackage{verbatim}
\usepackage[colorlinks,linkcolor=red!80!black]{hyperref}
\usepackage[noabbrev,capitalize]{cleveref}
% https://tex.stackexchange.com/a/42728/9019
\newcommand{\tagthis}[1]{\stepcounter{equation}\tag{\theequation}\label{#1}}
\definecolor{question}{rgb}{0,0,1}
\newlist{qlist}{enumerate}{1}
\setlist[qlist,1]{leftmargin=*, label=\textbf{(\alph*)}, ref={(\alph*)}}
\crefname{qlisti}{part}{parts}
\AtBeginEnvironment{qlist}{\color{question}}
\newcommand{\ask}[1]{\textcolor{question}{#1}}
\newenvironment{asking}{\begingroup\color{question}}{\endgroup}
\crefname{section}{Question}{Questions}
\definecolor{answer}{rgb}{0,.5,0}
\newcommand{\ans}[1]{\par\textcolor{answer}{Answer: #1}}
\newenvironment{answer}{\par\begingroup\color{answer}Answer: }{\endgroup}
\definecolor{points}{rgb}{0.6,0.3,0}
\newcommand{\pts}[1]{\textcolor{points}{[#1~points]}}
\newcommand{\hint}[1]{\textcolor{black!60!white}{\emph{Hint: #1}}}
\newcommand{\meta}[1]{\textcolor{black!60!white}{\emph{#1}}}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\newcommand{\bepsilon}{\boldsymbol{\epsilon}}
\DeclareMathOperator{\bigO}{\mathcal{O}}
\newcommand{\cH}{\mathcal{H}}
\newcommand{\cI}{\mathcal{I}}
\newcommand{\cX}{\mathcal{X}}
\newcommand{\cY}{\mathcal{Y}}
\DeclareMathOperator*{\E}{\mathbb{E}}
\newcommand{\dist}{\mathcal{D}}
\newcommand{\eps}{\varepsilon}
\newcommand{\op}{\mathit{op}}
\newcommand{\NN}{\mathbb{N}}
\newcommand{\R}{\mathbb{R}}
\let\RR\R
\DeclareMathOperator{\Rad}{Rad}
\newcommand{\SG}{\mathcal{SG}}
\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\spn}{span}
\newcommand{\tp}{^\mathsf{T}}
\DeclareMathOperator{\tr}{tr}
\DeclareMathOperator{\Unif}{Unif}
\DeclareMathOperator{\VCdim}{VCdim}
\newcommand{\xdist}{\mathcal{D}_x}
\newcommand{\ZZ}{\mathbb{Z}}
%%begin novalidate % overleaf code check gives false positives here
\makeatletter
% \abs{} uses regular-sized bars, \abs*{} uses auto-sized ones
\newcommand{\abs}{\@ifstar{\@Abs}{\@abs}}
\newcommand{\@abs}[1]{\lvert #1 \rvert}
\newcommand{\@Abs}[1]{\left\lvert #1 \right\rvert}
\newcommand{\norm}{\@ifstar{\@Norm}{\@norm}}
\newcommand{\@norm}[1]{\lVert #1 \rVert}
\newcommand{\@Norm}[1]{\left\lVert #1 \right\rVert}
\newcommand{\ceil}{\@ifstar{\@Ceil}{\@ceil}}
\newcommand{\@ceil}[1]{\lceil #1 \rceil}
\newcommand{\@Ceil}[1]{\left\lceil #1 \right\rceil}
\newcommand{\floor}{\@ifstar{\@Floor}{\@floor}}
\newcommand{\@floor}[1]{\lfloor #1 \rfloor}
\newcommand{\@Floor}[1]{\left\lfloor #1 \right\rfloor}
\makeatother
%%end novalidate % overleaf code check false-positives here
\declaretheorem[name=Theorem,numberwithin=section]{thm}
\declaretheorem[name=Proposition,sibling=thm]{prop}
\declaretheorem[name=Lemma,sibling=thm]{lemma}
\begin{document}
\begin{center}
\Large
CPSC 532D: Assignment 3 --
due Tuesday, 8 Nov 2022, \textbf{11:59pm}
\end{center}
As before:
use \LaTeX{}, either with the template I give or your own document if you prefer.
You can do this with a partner if you'd like (there's a ``find a group'' post on Piazza),
but please \textbf{make sure you understand everything you're submitting} --
don't just split an assignment in half.
If you do parts of the assignment with a partner and parts separately,
submit separate solutions, and say in each part you did together who you did it with.
If you look stuff up anywhere other than in \textbf{SSBD, MRT, Telgarsky, or Wainwright},
\textbf{cite your sources}:
just say in the answer to that question where you looked.
If you ask anyone else for help, \textbf{cite that too}.
Please do not look at solution manuals / search for people proving the things we're trying to prove / etc.
If you accidentally come across a solution while looking for something related,
still write the argument up in your own words, link to wherever you found it, and be clear about what happened.
\clearpage
\section{Monotonicity \pts{10}}
\begin{qlist}
\item \pts{3} \label{q:vc-monotone}
Prove that if $\cH \subseteq \cH'$, then $\VCdim(\cH) \le \VCdim(\cH')$.
\begin{answer}\textcolor{red}{TODO}\end{answer}
\item \pts{3} \label{q:rad-monotone}
Prove that if $\cH \subseteq \cH'$, then $\Rad(\cH\rvert_S) \le \Rad(\cH'\rvert_S)$.
\begin{answer}\textcolor{red}{TODO}\end{answer}
\item \pts{4}
Comment on how we should expect \cref{q:vc-monotone,q:rad-monotone} to affect the generalization loss of running ERM in $\cH$ versus $\cH'$, that is, $L_\dist(\operatorname{ERM}_{\cH}(S))$ versus $L_\dist(\operatorname{ERM}_{\cH'}(S))$ for a fixed $n$. What other factors are at play?
\begin{answer}\textcolor{red}{TODO}\end{answer}
\end{qlist}
\clearpage
\section{Threshold functions \pts{20}}
Recall our old friend, the class of threshold functions on $\R$:
\[
\cH = \left\{
x \mapsto \mathds 1(x \le \theta)
: \theta \in \R
\right\}
.\]
We showed in class (notes 6.1.1) that the VC dimension of $\cH$ is 1:
it can shatter a set of size one (a single point),
but it cannot shatter any set of size two (since it can't label the left point 0 and the right point 1).
\begin{qlist}
\item \label{q:thresh:sauer} \pts{5}
Use Sauer-Shelah (Lemma 6.8) and the (nicer) Corollary 6.9 to give two upper bounds on the growth function $\Pi_\cH(n)$.
\begin{answer}\textcolor{red}{TODO}\end{answer}
\item \label{q:thresh:tau} \pts{5}
Directly derive the exact value of the growth function $\Pi_\cH$ from its definition.
How tight are the upper bounds from \cref{q:thresh:sauer}?
\begin{answer}\textcolor{red}{TODO}\end{answer}
\item \label{q:thresh:rad-ub} \pts{5}
Plug the previous parts in to upper bound the empirical Rademacher complexity $\Rad(\cH\rvert_S)$
for an $S$ containing $n$ distinct real numbers.
\meta{You should give multiple bounds here, one per distinct bound from the previous parts.}
\begin{answer}\textcolor{red}{TODO}\end{answer}
\item \label{q:thresh:asymp} \pts{5}
Give the asymptotic value of $\Rad(\cH \rvert_S)$ for an $S$ containing $n$ distinct real numbers.
\textcolor{black}{Your answer might look something like ``$\Rad(\cH \rvert_S) = 7n + \bigO(1)$,'' with a justification. To be clear, this means that $7n - a_n \le \Rad(\cH\rvert_S) \le 7n + a_n$ for some $a_n = \bigO(1)$.}
How does it compare to the bound from \cref{q:thresh:rad-ub}?
\hint{
Imagine playing a (pretty boring) betting game where you bet \$1 whether a coin I'm flipping comes up heads or tails.
Since \href{http://www.stat.columbia.edu/~gelman/research/published/diceRev2.pdf}{all physical coin flips are unbiased},
you have a 50-50 shot of getting it right.
The distribution of how much money I owe you is known as a \emph{simple random walk}.
Your expected winnings at any time $t$ are always 0 (it's the sum of a bunch of mean-zero variables),
but if you play for a while and then go back and conveniently ``forget'' the record of flips after a certain point,
the expected \emph{maximum} value achieved at any point during a walk of length $n$
\href{https://arxiv.org/abs/cond-mat/0506195}{turns out to be}
$\sqrt{\frac{2 n}{\pi}} - \frac12 + \bigO(n^{-\frac12})$,
per (4) and (7) of the linked paper.
}
\begin{answer}\textcolor{red}{TODO}\end{answer}
\end{qlist}
\clearpage
\section{Piecewise-constant functions \pts{20 + 10 challenge + 5 bonus}}
Let $a = (a_1, a_2, \dots, a_k, 0, 0, \dots)$
be an eventually-zero sequence
with entries $a_i \in \{0, 1\}$.
Then define a hypothesis $h_a : \R_{> 0} \to \{0, 1\}$ by
\[
h_a(x) = a_{\ceil{x}} = \begin{cases}
a_1 & \text{if } 0 < x \le 1 \\
a_2 & \text{if } 1 < x \le 2 \\
&\vdots
\end{cases}
.\]
Consider the hypothesis class of all such functions:
$\cH = \{ h_a : \forall i \in \NN, a_i \in \{0, 1\} \text{ and } a \text{ is eventually zero} \}$.
\begin{qlist}
\item \label{q:const:vc} \pts{5} Show $\VCdim(\cH) = \infty$.
\begin{answer}\textcolor{red}{TODO}\end{answer}
\item \label{q:const:noshatter} \pts{8}
Give an example of a ``nontrivial'' distribution $\xdist$ on $\R_{> 0}$ where, for some $n < \VCdim(\cH)$, samples $S_x \sim \xdist^n$ have probability zero of being shattered by $\cH$.
\meta{``Nontrivial'' is of course a judgement call, but as an example, point masses at a single point are trivial, while, say, truncated normal distributions are not trivial.}
Thus prove that, for any $\dist$ with this $x$ marginal $\xdist$,
ERM over $\cH$ $(\eps,\delta)$-competes with the best hypothesis in $\cH$ for that $\dist$
with some finite sample complexity,
rather than the infinite sample complexity that would be implied by the VC bound.
\begin{answer}\textcolor{red}{TODO}\end{answer}
\item \label{q:const:srm} \pts{7}
Write $\cH = \cH_1 \cup \cH_2 \cup \cdots$ for each $\cH_k$ of a finite VC dimension,
and write down an explicit SRM algorithm that nonuniformly learns $\cH$.
\meta{By ``an explicit algorithm,'' I mean to expand out things like the uniform convergence bound for $\cH_k$; it's okay to write something as an argmin over $\cH$ (like in notes (7.2) if you say what $k_h$ is for a given $h$ and give the value of the Rademacher complexity term), or to just appeal to the SRM algorithm pseudocode from the notes (as long as you say what's in each $\cH_k$, what the $\eps_k$ functions are, and how to compute the stopping condition).}
\begin{answer}\textcolor{red}{TODO}\end{answer}
\item \label{q:const:hardsrm} \pts{5 bonus} \textbf{Bonus question:}
Suppose that instead of eventually-zero sequences,
we allowed all possible sequences $a$,
e.g.\ the $a$ that infinitely alternates between 0 and 1 could be an option.
Is this bigger $\cH'$ nonuniformly learnable?
\begin{answer}\textcolor{red}{TODO}\end{answer}
\item \label{q:const:radsmall} \pts{7} \textbf{Challenge question:}
Prove that, for \emph{any} $\xdist$,
$\E_{S_x \sim \xdist^n} \Rad(\cH\rvert_{S_x}) \to 0$
as $n \to \infty$.
\hint{One way to do it (there's probably more than one): first, reduce to the ``ceiled'' distribution over $\NN$ instead of over $\R_{>0}$, and use Corollary 4.8 to reduce to a bound in terms of $\E M / n$, where $M$ is the number of unique integers you've seen in $S$. Then prove that $\E M = o(n)$ for any distribution over $\NN$.}
\begin{answer}\textcolor{red}{TODO}\end{answer}
\item \label{q:const:oops} \pts{3} \textbf{Challenge question:}
An absentminded professor made the following argument on the \emph{final exam} for a course:
\begin{displayquote} \itshape
If a hypothesis class has $\E_{S_x \sim \xdist^n} \Rad(\cH\rvert_{S_x}) \to 0$ for all $\xdist$,
then for all realizable $\dist$,
\[ L_\dist(\hat h_S) \le \E_{S_x \sim \xdist^n} \Rad(\cH\rvert_{S_x}) + \sqrt{\frac{1}{2n} \log\frac1\delta} \to 0 .\]
Thus, by the ``fundamental theorem of statistical learning,'' $\cH$ must have finite VC dimension.
\end{displayquote}
Clearly this argument is wrong, since it puts \cref{q:const:vc,q:const:radsmall} in contradiction.
What was her mistake?
\begin{answer}\textcolor{red}{TODO}\end{answer}
\end{qlist}
\clearpage
\section{Generalization bound for a simple neural network \pts{40}}
\meta{Based on MRT exercise 3.11.}
Here is a class of neural networks mapping $\R^d$ to $\R$,
with one hidden layer of width $m$ and activations $\phi : \R \to \R$ a $1$-Lipschitz function (e.g.\ the ReLU or sigmoid function):
\[
\cH = \left\{
x \mapsto
\sum_{j=1}^m w_j \phi(u_j\tp x)
:
\norm{w}_1 \le \nu, \;
\norm{u_j}_2 \le \Lambda \text{ for each } j \in [m]
\right\}
.\]
We haven't used this in this class yet, but recall that $\norm{w}_1 = \sum_{j=1}^m \abs{w_j}$.
$\Lambda$ and $\nu$ are hyperparameters defining how complex the class is allowed to be.
\begin{qlist}
\item \pts{10}
Show that $\displaystyle \Rad(\cH\rvert_{S_x}) = \frac{\nu}{n} \E_{\bepsilon}\left[ \sup_{\norm{u}_2 \le \Lambda} \abs*{\sum_{i=1}^n \epsilon_i \phi(u\tp x_i)} \right]$.
\begin{answer}\textcolor{red}{TODO}\end{answer}
\end{qlist}
This variant of Talagrand's contraction lemma works for any $\cH$ and $\rho$-Lipschitz function $\Phi$:%
\footnote{
\color{black!60!white}
The MRT exercise claims that, for any $\cH$ and $\rho$-Lipschitz $\Phi$,
\begin{equation} \label{eq:tal:wrong}
\frac1n \E_{\bepsilon}\left[ \sup_{h \in \cH} \abs*{\sum_{i=1}^n \epsilon_i \Phi(h(x_i))} \right]
\le \frac{\rho}{n} \E_{\bepsilon}\left[ \sup_{h \in \cH} \abs*{\sum_{i=1}^n \epsilon_i h(x_i)} \right]
\tag{wrong}
.\end{equation}
This would be true if you dropped the absolute value bars, but the version as stated is not true.
For a counterexample,
consider the (not very interesting) hypothesis class $\cH = \{ x \mapsto 0 \}$
and the function $\Phi(x) = C$ which, as it ignores its argument, is $L$-Lipschitz for \emph{any} $L \ge 0$.
The LHS of \eqref{eq:tal:wrong} is $\abs{C}$ times
$\E_{\bepsilon} \abs*{\frac1n \sum_{i=1}^n \epsilon_i} > 0$.
The RHS of \eqref{eq:tal:wrong}, though, is exactly $0$,
since $h(x_i) = 0$ for all $x_i$.
This example also shows that assuming a symmetric $\cH$ would not be enough to fix \eqref{eq:tal:wrong}:
you would need $\Phi \circ \cH$ to be symmetric as well.
}
\begin{equation} \label{eq:tal}
\frac1n \E_{\bepsilon}\left[ \sup_{h \in \cH} \abs*{\sum_{i=1}^n \epsilon_i \Phi(h(x_i))} \right]
\le \frac{2 \rho}{n} \E_{\bepsilon}\left[ \sup_{h \in \cH} \abs*{\sum_{i=1}^n \epsilon_i h(x_i)} \right] + \frac{\abs{\Phi(0)}}{\sqrt n}
\tag{*}
.\end{equation}
\begin{qlist}[resume]
\item \pts{10}
Use \eqref{eq:tal} to upper bound $\Rad(\cH\rvert_{S_x})$ in terms of $\Rad(\cH'\rvert_{S_x})$, for
\[
\cH'
= \big\{ x \mapsto s \, (u\tp x) : \norm{u}_2 \le \Lambda, \, s \in \{-1, +1\} \big\}
= \big\{ x \mapsto u\tp x : \norm{u}_2 \le \Lambda \big\}
.\]
\begin{answer}\textcolor{red}{TODO}\end{answer}
\item \pts{10}
Bound $\E_{S \sim \xdist^n} \Rad(\cH'\rvert_S)$, and thereby $\E_{S \sim \xdist^n} \Rad(\cH \rvert_S)$.
\textcolor{black}{You'll need an assumption on $\norm x$ from $\xdist$ to do this;} be clear what you're assuming.
\begin{answer}\textcolor{red}{TODO}\end{answer}
\item \pts{10}
Give an expression for $\varepsilon$
such that
$\displaystyle
\Pr_{S \sim \dist^n}\left(
\sup_{h \in \cH} \left( L_\dist(h) - L_S(h) \right)
\le \varepsilon
\right) \ge 1 - \delta$,
\begingroup\color{black}
where you'll need to choose some loss function $\ell$ for inside $L$,
and be clear about any additional assumptions.
\textcolor{red}{Please choose a \emph{specific} loss function $\ell$; if it's not one we've used in class, be sure to justify it as a ``reasonable'' loss function.}
There are several valid approaches here.
Try to pick a reasonable set of assumptions and loss function;
something like ``all the $y$s are equal to $0$'' is not reasonable.
Your bound should have $\varepsilon \to 0$ as $n \to \infty$ with all other parameters fixed.
\endgroup
\begin{answer}\textcolor{red}{TODO}\end{answer}
\end{qlist}
\end{document}