\documentclass{article}

\usepackage{amsmath,amssymb,amsthm}
\usepackage{csquotes}
\usepackage{dsfont}
\usepackage{thmtools}
\usepackage{etoolbox}
\usepackage{enumitem}
\usepackage{fullpage}
\usepackage{parskip}
\usepackage{xcolor}
\usepackage{verbatim}

\usepackage[colorlinks,linkcolor=red!80!black]{hyperref}
\usepackage[noabbrev,capitalize]{cleveref}

% Question styling: level-1 "qlist" enumerate whose items are numbered
% (section.item) and typeset in blue; cleveref calls them "Question".
\definecolor{question}{rgb}{0,0,1}
\crefname{section}{Question}{Questions}
\newlist{qlist}{enumerate}{1}

% NOTE(review): \iffirstq is declared but never used anywhere in this file —
% confirm whether it can be removed.
\newif\iffirstq
\firstqfalse
\setlist[qlist,1]{%
    leftmargin=*,
    label=\textbf{\color{black}(\thesection.\arabic*)}, ref={(\thesection.\arabic*)},
}
% cref type for level-1 qlist items (enumitem appends the roman level "i")
\crefname{qlisti}{Question}{Questions}
\AtBeginEnvironment{qlist}{\color{question}}

% Answers are typeset in green, prefixed with "Answer: ".
\definecolor{answer}{rgb}{0,.5,0}
% One-shot inline form. NOTE(review): unused in this file's body — the
% answer environment below is what the boilerplate uses; confirm it's needed.
\newcommand{\ans}[1]{\par\textcolor{answer}{Answer: #1}}
% Multi-paragraph form; \begingroup/\endgroup scope the colour switch.
\newenvironment{answer}{\par\begingroup\color{answer}Answer: }{\endgroup}

\definecolor{points}{rgb}{0.6,0.3,0}
% Point-count tag, e.g. \pts{8} -> "[8 points]" in brown.
% \pts is used inside \section titles, so \texorpdfstring supplies a plain-text
% fallback for hyperref's PDF bookmarks (avoids "Token not allowed" warnings);
% hyperref is loaded above, so \texorpdfstring is available here.
\newcommand{\pts}[1]{\texorpdfstring{\textcolor{points}{[#1~points]}}{[#1 points]}}

\newcommand{\hint}[1]{\textcolor{black!60!white}{\emph{Hint: #1}}}
\newcommand{\meta}[1]{\textcolor{black!60!white}{\emph{#1}}}

\newcommand{\TODO}{\color{red}{TODO}}

% Math notation shorthands. Starred \DeclareMathOperator (\argmax, \argmin,
% \E) sets sub/superscripts below the operator in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator{\bigO}{\mathcal{O}}
\newcommand{\cA}{\mathcal{A}}
\newcommand{\cH}{\mathcal{H}}
\newcommand{\cX}{\mathcal{X}}
\newcommand{\cY}{\mathcal{Y}}
\newcommand{\cZ}{\mathcal{Z}}
\DeclareMathOperator*{\E}{\mathbb{E}}
\newcommand{\eps}{\varepsilon}
\newcommand{\indic}{\mathds{1}}% blackboard-bold 1 (dsfont)
\newcommand{\Lip}{\mathrm{Lip}}% Lipschitz-norm subscript
\newcommand{\op}{\mathit{op}}% operator-norm subscript
\newcommand{\R}{\mathbb{R}}
\newcommand{\SG}{\mathcal{SG}}% subgaussian class
\DeclareMathOperator{\spn}{span}
\newcommand{\tp}{^\mathsf{T}}% transpose
\DeclareMathOperator{\Tr}{Tr}
\newcommand{\ud}{\mathrm{d}}% upright differential d
\newcommand{\dist}{\mathcal{D}}% data distribution
\newcommand{\xdist}{\mathcal{D}_x}% marginal over x

%%begin novalidate  % overleaf code check gives false positives here
\makeatletter
% \abs{} uses auto-sized bars, \abs*{} uses regular-sized ones
\newcommand{\abs}{\@ifstar{\@abs}{\@Abs}}
\newcommand{\@abs}[1]{\lvert #1 \rvert}
\newcommand{\@Abs}[1]{\left\lvert #1 \right\rvert}

\newcommand{\norm}{\@ifstar{\@norm}{\@Norm}}
\newcommand{\@norm}[1]{\lVert #1 \rVert}
\newcommand{\@Norm}[1]{\left\lVert #1 \right\rVert}

\newcommand{\ceil}{\@ifstar{\@ceil}{\@Ceil}}
\newcommand{\@ceil}[1]{\lceil #1 \rceil}
\newcommand{\@Ceil}[1]{\left\lceil #1 \right\rceil}

\newcommand{\floor}{\@ifstar{\@floor}{\@Floor}}
\newcommand{\@floor}[1]{\lfloor #1 \rfloor}
\newcommand{\@Floor}[1]{\left\lfloor #1 \right\rfloor}
\makeatother
%%end novalidate

\numberwithin{equation}{section}
\declaretheorem[name=Theorem,numberwithin=section]{thm}
\declaretheorem[name=Proposition,sibling=thm]{prop}

\begin{document}
\begin{center}
\Large
CPSC 532D, Fall 2024: Assignment 2
\\
due Friday, October 11 at 11:59pm
\end{center}

\vspace{2cm}


You can do this with a partner if you'd like (there's a ``find a group'' post on Piazza).
% Do assignment 1 alone; future ones will allow partners.
\textbf{Read the website section on academic integrity}
\href{https://www.cs.ubc.ca/~dsuth/532D/24w1/#policies}{here}
for what you're allowed to do and not do;
in particular, cite your sources (including people you talked to!)
and don't use ChatGPT/etc.
If you're not sure if something is okay, ask.

Prepare your answers to these questions using \LaTeX;
hopefully you're reasonably familiar with it,
but if not, try using Overleaf and looking around for tutorials online.
Feel free to ask questions if you get stuck on things on Piazza
(but remove any details about the actual answers to the questions\dots
make a private post if that's tough).
If you prefer, the \texttt{.tex} source for this file is available on the course website,
and you can put your answers in
\verb|\begin{answ|\verb|er} My answer here... \end{answer}|
% that weird formatting so my answer-stripping script doesn't catch it :p
environments to make them stand out;
feel free to delete whatever boilerplate you want.
Or answer in a fresh document.

Submit your answers as a single PDF on Gradescope:
\href{https://canvas.ubc.ca/courses/156968/external_tools/54329?display=borderless}{here's the link}.
Make sure to use the Gradescope group feature if you're working in a group.
You'll be prompted to mark where each question is in your PDF;
make sure you mark all relevant pages for each part (which saves us a surprising amount of grading time).

Please \textbf{put your name on the first page} as a backup, just in case.
If something goes wrong, you can also email your assignment to me directly (\texttt{dsuth@cs.ubc.ca}).


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
\section{Questions you should probably get approximately correct \pts{40}}
\begin{qlist}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\item \pts{8}
  Let $\cA$ be an algorithm that agnostically PAC learns a hypothesis class $\cH$.
  Show that $\cA$ also (realizably) PAC learns $\cH$.

\begin{answer}\TODO\end{answer}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\item \pts{7} \color{black}
  Let $\cA : \cZ^* \to \cH$ be an algorithm
  and $\ell$ a loss
  such that
  there is some function
  $\eps : \mathbb N \times (0, 1) \to \R$
  such that
  for all $m \in \mathbb N$ and $\delta \in (0, 1)$,
  for all $\eps > \eps(m, \delta)$,
  it holds for all $\dist$ that
  \[
    \Pr_{S \sim \dist^m, \cA}\left( L_\dist(\cA(S)) - \inf_{h \in \cH} L_\dist(h) \ge \eps \right) \le 1 - \delta
  ,\]
  where the randomness is over both the choice of sample set $S$
  and any internal randomness in the algorithm $\cA$.
  Further suppose that
  $\eps(m, \delta)$ is nonincreasing in $m$ for each fixed $\delta \in (0, 1)$,
  and that $\lim_{m \to \infty} \eps(m, \delta) = 0$.
  \textcolor{question}{Show that $\cA$ agnostically PAC learns $\cH$.}

\begin{answer}\TODO\end{answer}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\item \pts{10}
    \textcolor{black}{Let $A$ be a learning algorithm, $\dist$ a probability distribution, and let $L$ denote the random variable $L_\dist(A(S))$ for some loss function bounded in $[0, 1]$.}
    Prove that the following two statements are equivalent:
    \begin{enumerate}
        \item For every $\eps, \delta > 0$, there is some $m(\eps, \delta)$ such that for all $m \ge m(\eps, \delta)$, $\Pr_{S \sim \dist^m}( L > \eps ) < \delta$.
        \item $\lim_{m \to \infty} \E_{S \sim \dist^m} L = 0$. \meta{The expected loss goes to zero asymptotically.}
    \end{enumerate}
\begin{answer}\TODO\end{answer}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\item \pts{15} \color{black}
  Consider data that is a set of binary attributes, $\cX \subseteq \{0, 1\}^d$ for $d \ge 2$,
  and has $\cY = \{0, 1\}$.
  A \emph{binary decision tree} is a model that looks generally like
  \begin{itemize}
    \item If $x_3$, then:
    \begin{itemize}
        \item If $x_{12}$, then:
        \begin{itemize}
            \item Return 1
        \end{itemize}
        \item Otherwise (not $x_{12}$):
        \begin{itemize}
            \item If $x_1$, then
            \begin{itemize}
                \item Return 0
            \end{itemize}
            \item Otherwise (not $x_1$):
            \begin{itemize}
                \item Return 1
            \end{itemize}
        \end{itemize}
    \end{itemize}
    \item Otherwise (not $x_3$):
    \begin{itemize}
        \item Return 0
    \end{itemize}
  \end{itemize}
  Let $\cH_k$ be the set of all such trees of depth at most $k \le d$,
  and let $\hat h_S$ be any ERM in $\cH_k$.\footnote{This problem is NP-hard, but \href{https://systopia.cs.ubc.ca/decision_trees}{specialized algorithms} can usually solve an \emph{almost}-equivalent problem well in practice.}
  \textcolor{question}{Bound $\abs{\cH_k}$ as a function of $d$ and $k$, and hence show that ERM successfully PAC-learns this class for the zero-one loss based on results from class.}
  State the error bound or sample complexity (either is fine)
  with explicit constants,
  but then \textcolor{question}{give a $\bigO_p$ statement in terms of $m$, $k$, and $d$}
  (treating $\delta$ as a constant).

  \meta{You don't have to try to get the super-tightest-possible bound here, though of course you can if you want to. But also don't be absurdly loose (e.g.\ totally ignoring the depth-$k$ limitation).}

  \hint{A \emph{perfect binary tree} of depth $k$ has branches at every ``interior'' node, i.e.\ no ``early returns.'' Such a tree has $2^k$ leaf nodes at the bottom. One thing you can try is to map $\cH_k$ onto the set of perfect binary trees.}

  \hint{In this kind of bound, it's okay to think of $\cH$ as an input-output mapping: if two trees have different representations, but return the same value for every possible input in $\cX$, then you can think of them as the same hypothesis, because their value of $L_\dist - L_S$ must be the same.}
\begin{answer}\TODO\end{answer}

\end{qlist}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
\section{Sums, means, and maxes of subgaussians \pts{50}}

In this question, we're going to explore subgaussians and different versions of Hoeffding's inequality some more.

\begin{qlist}

\item \label{q:sg:sum-dep-sqrt} \pts{10}
    Let $X_1 \in \SG(\sigma_1)$ and $X_2 \in \SG(\sigma_2)$; \textbf{do not} assume independence. 
    Show that $X_1 + X_2$ is $\SG(\sqrt{2} \sqrt{\sigma_1^2 + \sigma_2^2})$.

    \hint{One form of the ever-useful Cauchy-Schwarz inequality is that $\E[X Y] \le \sqrt{\E[X^2] \E[Y^2]}$, even if $X$ and $Y$ are dependent.}

    \begin{answer}\TODO\end{answer}

\item \label{q:sg:sum-dep} \pts{15}
    Let $X_1 \in \SG(\sigma_1)$ and $X_2 \in \SG(\sigma_2)$; \textbf{do not} assume independence. 
    Show that $X_1 + X_2$ is $\SG(\sigma_1 + \sigma_2)$.

    \hint{One way is to use H\"older's inequality: $\E[X Y] \le \E[X^p]^{1/p} \E[Y^q]^{1/q}$ for all $p, q \in [1, \infty]$ with $1/p + 1/q = 1$, i.e.\ $q = p / (p - 1)$. Do this for a general $p$, see what you get, then find the optimal $p$.}

    \begin{answer}\TODO\end{answer}

\item \pts{10}
    \textcolor{black}{
    Let $X_1, \dots, X_m$ each be $\SG(\sigma)$ with mean $\mu$,
    but do \emph{not} assume independence.}
    Construct a high-probability bound on their mean,
    \textcolor{black}{
    $\Pr\left( \frac1m \sum_{i=1}^m X_i > \mu + \text{something} \right) \le \delta$,
    using either \cref{q:sg:sum-dep-sqrt} or \ref{q:sg:sum-dep}
    rather than the notes' Proposition 3.6 (which assumed independence).}
    How much worse is what you just got than (Hoeffding') from the notes
    when the variables are actually independent,
    particularly in terms of its dependence on $m$?
    Could you have expected to get a better result,
    or can you construct a dependent example where this dependence on $m$ is necessary?

    \hint{One of these results is \emph{much} easier to use than the other one.}

    \begin{answer}\TODO\end{answer}

\item \pts{15}
    \meta{
    So far, we've only looked at means of a bunch of random variables.
    But for uniform convergence, we care about the \emph{worst-case} behaviour of errors.
    We're going to (or have already, depending on when you're reading this\dots{}) use the following result in a key way in class.
    }

    Let $X_1, \dots, X_m$ be zero-mean random variables that are each $\SG(\sigma)$;
    \textbf{do not} assume independence.\footnote{As far as I know, unlike for the mean, independence actually wouldn't help here.}
    Prove that
    \[
        \E\left[ \max_{i=1,\dots,m} X_i \right] \le \sigma \sqrt{2 \log(m)}
    .\]

    \hint{Bound $\exp(\lambda \E \max_i X_i)$ in terms of something that only depends on $m$, $\sigma$, and $\lambda$, by rearranging into a form that lets you plug in the definition of subgaussianity. Then turn that into a bound on $\E \max_i X_i$ in terms of $m$, $\sigma$, and $\lambda$. Then optimize $\lambda$ in that bound to get something only depending on $m$ and $\sigma$.}

    \hint{By Jensen's inequality, $\exp(\E Y) \le \E \exp(Y)$.}

    \hint{One way to upper-bound the max of a bunch of nonnegative numbers is by their sum.
    Although this might seem really loose, if the max is a lot bigger than the second-biggest number -- e.g.\ because they're on an exponential scale -- it's not too bad.}

    \begin{answer}\TODO\end{answer}
\end{qlist}





%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
\section{Limits of Learning Lipschitz Laws \pts{10 challenge}}

So far, we've only seen covering number bounds based on covering norm balls in $\R^d$.
Let's use an analogous argument with a different kind of result.

Let $\cH = \{ h : [0, C]^d \to \R : h(0) = 0, \norm{h}_\Lip \le B \}$ for some $B \ge 0$,
where the Lipschitz constant is with respect to the usual Euclidean norms.
This is a nonparametric class
that includes ``a \emph{lot}'' of functions.

Consider the ``sup-norm''/uniform norm defined as
$\norm{f}_\infty = \sup_{x} \abs{f(x)}$,
which induces a metric
$\rho_\cH(h, g) = \norm{h - g}_\infty$.
(Recall that $h - g$ is the function that maps $x$ to $h(x) - g(x)$.)
It can be shown\footnote{%
    Example 5.10 of \href{https://go.exlibris.link/9ZMcv9J6}{Wainwright's book}
    shows a lower bound for $d = 1$, $C = 1$
    and points towards how to do the upper bound.
    Just afterwards, he mentions the $d > 1$ case is analogous.
    \meta{(Unfortunately, he only states it in $\asymp$ notation and I'm not totally sure whether the constant there is allowed to depend on $d$ or not. The version of \eqref{eq:covering-lip} is definitely valid -- see e.g.\ \href{https://ieeexplore.ieee.org/document/7944658}{Lemma 6 here} which bounds a more general case with explicit constants -- but I'm not certain it's necessary.)}
    To generalize to $C \ne 1$,
    consider that if $h : [0, C]^d \to \R$
    is $B$-Lipschitz,
    then $x \mapsto h(C x)$ is a $[0, 1]^d \to \R$ function
    which is $BC$-Lipschitz.
}
that the covering number of $\cH$ with respect to this $\rho_\cH$
satisfies
\begin{equation} \label{eq:covering-lip}
    \log N(\cH, \eta) \le \left( \frac{a B C}{\eta} \right)^d
\end{equation}
for some constant $a > 0$
and all $\eta$ small enough that the right-hand side is at least $1$.
Compare this to the $d \log\frac{3 B}{\eta}$ bound we saw for Euclidean balls.

Consider the absolute-value loss $\ell(h, (x, y)) = \abs{h(x) - y}$,
and suppose $\dist$ is such that $\Pr_{(x, y) \sim \dist}(\abs{y} \le Y) = 1$.
\textcolor{question}{Prove a high-probability bound on $\sup_{h \in \cH} L_\dist(h) - L_S(h)$}
with the best rate (in terms of $m$) you can find;
it should depend on $a$, $B$, $C$, $Y$, $d$, $m$, and the error probability $\delta$.
Prove this bound with explicit constants,
but then also \textcolor{question}{summarize it in a $\bigO_p$ statement}
treating everything but $m$ as a constant.
\textcolor{question}{Is the rate faster or slower than the logistic regression bound we saw in class?}

\hint{This proof ends up kind of long (at least mine did). Split it into appropriate sub-parts, and maybe define some helper variables along the way so your expressions don't get too unwieldy (but then expand out the final answer). Feel free to make simplifications that make things look nicer at the cost of making the constants worse, but try to get the $m$ dependence right.}

\hint{It's not possible to find the exactly optimal choice of $\eta$ here (when $d \ge 2$). You'll probably want to use $\sqrt{x + y} \le \sqrt x + \sqrt y$ before picking $\eta$, which gives a nicer bound anyway.}

\hint{The \href{https://en.wikipedia.org/wiki/Triangle_inequality\#Reverse_triangle_inequality}{reverse triangle inequality} is often useful.}

\meta{You don't have to repeat any portion of the argument which is verbatim identical to the notes, but you can. If you're not, be very clear about exactly what you've changed.}

\begin{answer}\TODO\end{answer}

\end{document}
