\documentclass[11pt]{paper}

\setlength{\parindent}{0 pt}
\setlength{\parskip}{1.5 ex plus 0.5 ex minus 0.2 ex}

\usepackage{fullpage}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{amsthm}
%\usepackage{wasysym}
\usepackage[usenames]{color}

\usepackage{enumerate}
\usepackage{cite}
\usepackage{float}
\usepackage{xspace}


\floatstyle{ruled}
\newfloat{algorithm}{thp}{alg}
\floatname{algorithm}{Algorithm}


%\usepackage {hyperref}

\newcommand{\mathset}[1]{\ensuremath {\mathbb {#1}}}
\newcommand{\N}{\mathset {N}}
\newcommand{\R}{\mathset {R}}
\newcommand{\Q}{\mathset {Q}}
\newcommand{\Z}{\mathset {Z}}
\newcommand{\EX}{\mathbf {E}}
\newcommand{\script}[1]{\ensuremath{\mathcal {#1}}}
\newcommand{\etal}{\textit {et al.}}
\newcommand{\eps}{\varepsilon}
\newcommand{\eqdef}{:=}
\newcommand{\boruvka}{Bor\r{u}vka}
\newcommand{\wspd}{\texttt{wspd}}
\DeclareMathOperator{\emst}{emst}
\DeclareMathOperator{\conv}{CH}
\DeclareMathOperator{\argmin}{argmin}
\DeclareMathOperator{\DT}{DT}
\DeclareMathOperator{\UC}{UC}
\DeclareMathOperator{\LC}{LC}
\newcommand{\DKL}{D_\textup{KL}}

\newtheorem{theorem}{Theorem}[section]
\newtheorem{problem}[theorem]{Problem}
\newtheorem{obs}[theorem]{Observation}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{definition}[theorem]{Definition}


\title{Chernoff Bounds}
\author{Wolfgang Mulzer}

\begin{document}
\maketitle

\section{The General Bound}

Let $P = (p_1, \dots, p_m)$ and $Q = (q_1, \dots, q_m)$ be two 
distributions 
on $m$ elements, i.e., $p_i, q_i \geq 0$, for $i=1, \dots, m$,  
and $\sum_{i=1}^{m} p_i = \sum_{i=1}^m q_i = 1$. The
\emph{Kullback-Leibler divergence} or \emph{relative entropy} of 
$P$ and $Q$ is defined as
\[
  \DKL(P \| Q) \eqdef \sum_{i=1}^m p_i \ln \frac{p_i}{q_i}.
\]
If $m = 2$, i.e., $P = (p, 1-p)$ and $Q = (q, 1-q)$, we also write
$\DKL(p \| q)$.
The Kullback-Leibler divergence provides a measure of distance between the
distributions $P$ and $Q$: it represents the expected number of additional
nats (bits, if we use $\log_2$ instead of $\ln$) needed to encode a symbol
drawn from $P$ with a code that is optimal
for distribution $Q$. We can now state the general form of the Chernoff 
Bound:
\begin{theorem}\label{thm:chernoff}
  Let $X_1, \dots, X_n$ be independent random variables with $X_i \in \{0,1\}$
  and $\Pr[X_i = 1] = p$, for $i = 1, \dots, n$. Set $X \eqdef \sum_{i=1}^n X_i$.
  Then, for any $t \in [0, 1-p]$, we have
  \[
    \Pr[X \geq (p+t)n ] \leq e^{-\DKL(p+t \| p)n}.
  \]
\end{theorem}
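For concreteness, consider $n$ fair coin flips ($p = 1/2$) and the event that
at least three quarters of them come up heads ($t = 1/4$). The bound then reads
\[
  \Pr[X \geq 3n/4] \leq e^{-\DKL(3/4 \| 1/2)n}
  = \Bigl[\bigl(\tfrac{2}{3}\bigr)^{3/4} \cdot 2^{1/4}\Bigr]^n
  = \Bigl(\tfrac{2}{3^{3/4}}\Bigr)^n \approx 0.878^n,
\]
so the probability decays exponentially in $n$.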

\section{Four Proofs}\label{sec:proofs}

\subsection{The Moment Method}\label{sec:moment}

The usual proof of Theorem~\ref{thm:chernoff}  uses the exponential 
function $\exp$ and Markov's inequality. It is called the \emph{moment method} 
because the moment generating function $\EX[e^{\lambda X}]$ simultaneously encodes 
all \emph{moments} of $X$, 
i.e., $\EX[X]$, $\EX[X^2]$, $\EX[X^3]$, etc.  The proof technique is very general and 
can be used to obtain several variants of Theorem~\ref{thm:chernoff}.
Let $\lambda > 0$ be a parameter to be determined later. We have
\[
  \Pr[X \geq (p+t)n ] = \Pr[\lambda X \geq \lambda (p+t)n ] = 
  \Pr\bigl[e^{\lambda X} \geq e^{\lambda (p+t)n} \bigr].
\]
From Markov's inequality, we obtain
\[
  \Pr\bigl[e^{\lambda X} \geq e^{\lambda (p+t)n} \bigr] \leq
    \frac{\EX[e^{\lambda X}]}{e^{\lambda (p+t)n}}.
\]
Now, the independence of the $X_i$ yields
\[
  \EX[e^{\lambda X}] = \EX\Bigl[e^{\lambda \sum_{i=1}^n X_i}\Bigr]
  =  \EX\Biggl[\prod_{i=1}^n e^{\lambda X_i}\Biggr]
  =  \prod_{i=1}^n \EX\Bigl[e^{\lambda X_i}\Bigr]
  =  \bigl(p e^\lambda + 1-p\bigr)^n.
\]
Thus, 
\begin{equation}\label{equ:lambdabound}
  \Pr[X \geq (p+t)n ] \leq
    \Bigl(\frac{pe^\lambda + 1-p}{e^{\lambda (p+t)}}\Bigr)^n,
\end{equation}
for every $\lambda > 0$. Optimizing for $\lambda$ using calculus,
we get that the right hand side is minimized if
\[
 e^\lambda = \frac{(1-p)(p+t)}{p(1-p-t)}.
\]
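Indeed, with this choice of $\lambda$, the numerator of the fraction in
(\ref{equ:lambdabound}) simplifies to
\[
  pe^\lambda + 1 - p 
  = \frac{(1-p)(p+t)}{1-p-t} + 1 - p 
  = \frac{(1-p)\bigl[(p+t) + (1-p-t)\bigr]}{1-p-t}
  = \frac{1-p}{1-p-t},
\]
while the denominator becomes 
$e^{\lambda(p+t)} = \bigl[(1-p)(p+t)/(p(1-p-t))\bigr]^{p+t}$.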
Plugging this into (\ref{equ:lambdabound}), we get
\[
  \Pr[X \geq (p+t)n ] \leq \Biggl[\Bigl(\frac{p}{p+t}\Bigr)^{p+t}
         \Bigl(\frac{1-p}{1-p-t}\Bigr)^{1-p-t}\Biggr]^n = 
      e^{-\DKL(p+t \| p)n},
\]
as desired.

\subsection{Chv\'atal's Method}\label{sec:chvatal}

Let $B(n,p)$ be the random variable that gives the number of heads in 
$n$ independent Bernoulli trials with success probability $p$.
It is well known that
\[
\Pr[B(n,p) = l] = \binom{n}{l} p^l (1-p)^{n-l},
\]
for $l = 0, \dots, n$. Thus, for any $\tau \geq 1$ and $k \geq pn$, we get
\begin{multline*}
\Pr[B(n,p)\ge k] = \sum_{i=k}^n \binom{n}{i}p^i (1-p)^{n-i}\\ 
\leq \sum_{i=k}^n \binom{n}{i}p^i (1-p)^{n-i} 
\underbrace{\tau^{i-k}}_{\geq 1} +
\underbrace{\sum_{i=0}^{k-1} \binom{n}{i}p^i (1-p)^{n-i} 
\tau^{i-k}}_{\geq 0} 
= \sum_{i=0}^n \binom{n}{i}p^i (1-p)^{n-i} \tau^{i-k}.
\end{multline*}
Using the Binomial theorem, we obtain
\[
\Pr[B(n,p)\ge k] \leq
\sum_{i=0}^n \binom{n}{i}p^i (1-p)^{n-i} \tau^{i-k} = 
\tau^{-k}\sum_{i=0}^n \binom{n}{i}(p\tau)^{i} (1-p)^{n-i} =
\frac{(p\tau+1-p)^n}{\tau^k}.
\]
If we write $k = (p+t)n$ and $\tau = e^\lambda$, we can conclude
\[
\Pr[B(n,p)\ge (p+t)n] \leq
\Bigl(\frac{p e^\lambda+1-p}{e^{\lambda(p+t)}}\Bigr)^n.
\]
This is the same as (\ref{equ:lambdabound}), so we can complete the
proof of Theorem~\ref{thm:chernoff} as in Section~\ref{sec:moment}.
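Note that the optimal choice of $\lambda$ from Section~\ref{sec:moment} is
admissible here: since $(1-p)(p+t) - p(1-p-t) = t \geq 0$, we have
\[
  \tau = e^\lambda = \frac{(1-p)(p+t)}{p(1-p-t)} \geq 1.
\]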

\subsection{The Impagliazzo-Kabanets Method}\label{sec:ik}

Let $\lambda \in [0,1]$ be a parameter to be chosen later. 
Let $I \subseteq \{1, \dots, n\}$ be a random index set obtained by
including each element $i \in \{1, \dots, n\}$ with probability 
$\lambda$.
We estimate 
$\Pr\bigl[\prod_{i \in I} X_i = 1\bigr]$ in two different ways, where
the probability is over the random choice of $X_1, \dots, X_n$ and $I$.

On the one hand, using the union bound and independence, we have
\begin{multline}\label{equ:ikupper}
\Pr\Bigl[\prod_{i \in I} X_i = 1\Bigr]
\leq \sum_{S \subseteq \{1, \dots, n\}} 
     \Pr\Bigl[I = S \wedge  \prod_{i \in S} X_i = 1\Bigr]
= \sum_{S \subseteq \{1, \dots, n\}} \Pr[I = S] \cdot  
    \prod_{i \in S} \Pr[ X_i = 1]\\
= \sum_{S \subseteq \{1, \dots, n\}} \lambda^{|S|}(1-\lambda)^{n-|S|} \cdot 
    p^{|S|}
= \sum_{s= 0}^{n} \binom{n}{s} (\lambda p)^{s}(1-\lambda)^{n-s}
= (\lambda p + 1 - \lambda)^n,
\end{multline}
by the Binomial theorem. On the other hand, 
by the law of total probability,
\[
\Pr\Bigl[\prod_{i \in I} X_i = 1\Bigr]
 \geq
\Pr\Bigl[\prod_{i \in I} X_i = 1 \mid X \geq (p+t)n\Bigr]\Pr[X \geq (p+t)n].
\]
Now, fix an outcome $X_1 = x_1, \dots, X_n = x_n$ with 
$\sum_{i=1}^n x_i \geq (p+t)n$. For this fixed choice, the probability 
$\Pr\bigl[\prod_{i \in I} x_i = 1\bigr]$, taken over the random choice of $I$, 
is exactly the probability that
$I$ avoids all the $n-\sum_{i=1}^n x_i$ indices $i$ with $x_i = 0$.
Thus,
\[
\Pr\Bigl[\prod_{i \in I} x_i = 1\Bigr]
= (1-\lambda)^{n-\sum_{i=1}^n x_i} \geq (1-\lambda)^{(1-p-t)n}.
\]
Since this bound holds uniformly for every choice of $x_1, \dots, x_n$ with
$\sum_{i=1}^n x_i \geq (p+t)n$, we get
\[
\Pr\Bigl[\prod_{i \in I} X_i = 1 \mid X \geq (p+t)n\Bigr] \geq 
  (1-\lambda)^{(1-p-t)n},
\]
so
\[
\Pr\Bigl[\prod_{i \in I} X_i = 1\Bigr]
 \geq
(1-\lambda)^{(1-p-t)n}\Pr[X \geq (p+t)n].
\]
Combining with (\ref{equ:ikupper}),
\begin{equation}\label{equ:ikbound}
\Pr[X \geq (p+t)n] \leq 
\left(\frac{\lambda p + 1 - \lambda}{(1-\lambda)^{(1-p-t)}}\right)^n.
\end{equation}
Using calculus, we get that the right hand side is minimized for
$\lambda = t/\bigl((1-p)(p+t)\bigr)$ (note that $\lambda \leq 1$ for $t \leq 1-p$).
Plugging this into (\ref{equ:ikbound}),
\[
  \Pr[X \geq (p+t)n ] \leq
    \Biggl[\Bigl(\frac{p}{p+t}\Bigr)^{p+t}
      \Bigl(\frac{1-p}{1-p-t}\Bigr)^{1-p-t}\Biggr]^n = 
      e^{-\DKL(p+t \| p)n},
\]
as desired.
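To verify the last step, note that with $\lambda = t/\bigl((1-p)(p+t)\bigr)$,
\[
  \lambda p + 1 - \lambda = 1 - \lambda(1-p) = 1 - \frac{t}{p+t} = \frac{p}{p+t}
  \qquad\text{and}\qquad
  1 - \lambda = \frac{(1-p)(p+t) - t}{(1-p)(p+t)} = \frac{p(1-p-t)}{(1-p)(p+t)},
\]
and substituting these expressions into (\ref{equ:ikbound}) gives the stated bound.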

\subsection{The Coding-Theoretic Argument}\label{sec:encoding}

The next proof, due to Luc Devroye, G\'abor Lugosi, and Pat 
Morin, is inspired by coding theory.
Let $\{0,1\}^n$ be the set of all bit strings of length $n$,
and let $w: \{0, 1\}^n \rightarrow [0,1]$ be a
\emph{weight function}. We call $w$
\emph{valid} if $\sum_{x \in \{0,1\}^n} w(x) \leq 1$.
The following lemma says that for any probability distribution
$p_x$ on $\{0,1\}^n$, a valid weight function 
is unlikely to be substantially larger than $p_x$.
\begin{lemma}\label{lem:encoding}
Let $\mathcal{D}$ be a probability 
distribution on $\{0,1\}^n$ that assigns to each $x \in \{0,1\}^n$
a probability $p_x$, and let $w$ be a valid 
weight function. 
For any $s \geq 1$, we have 
\[
  \Pr_{x \sim \mathcal{D}}\left[w(x) \geq sp_x\right] 
     \leq 1/s.
\]
\end{lemma}

\begin{proof}
Let $Z_{s} = \{ x \in \{0,1\}^n \mid w(x) \geq sp_x\}$.
We have
\[
  \Pr_{x \sim \mathcal{D}}\left[w(x) \geq s p_x\right] 
  = \sum_{\substack{x \in Z_s \\ p_x > 0}}  p_x
  \leq \sum_{\substack{x \in Z_s \\ p_x > 0}} p_x \frac{w(x)}{sp_x}
  \leq  (1/s) \sum_{x \in Z_s} w(x) 
    \leq 1/s,
\]
since $w(x) / sp_x  \geq 1$ for $x \in Z_s$, $p_x > 0$,  and since
$w$ is valid.
\end{proof}

We now show that Lemma~\ref{lem:encoding} 
implies Theorem~\ref{thm:chernoff}. 
For this, we interpret the sequence $X_1, \dots, X_n$
as a bit string of length $n$. This induces a probability distribution 
$\mathcal{D}$ that assigns to each $x \in \{0, 1\}^n$ the 
probability 
$p_x = p^{k_x} (1-p)^{n-k_x}$, where $k_x$ denotes the number of
$1$-bits in $x$.
We define a weight function $w : \{0,1\}^n \rightarrow [0,1]$ by
$w(x) = (p+t)^{k_x}(1-p-t)^{n-k_x}$, for 
$x \in \{0,1\}^n$.
Then $w$ is valid, since $w(x)$ is the
probability that $x$ is generated by setting each bit 
to $1$ independently with probability $p+t$.
For $x \in \{0,1\}^n$, 
we have
\[
\frac{w(x)}{p_x}
=
\left(\frac{p+t}{p}\right)^{k_x}
\left(\frac{1-p-t}{1-p}\right)^{n - k_x}. 
\]
Since $((p+t)/p)((1-p)/(1-p-t)) \geq 1$, it follows
that $w(x)/p_x$ is an increasing function of $k_x$.
Hence, if $k_x \geq (p + t)n$, we have
\[
\frac{w(x)}{p_x}
\geq
\left[\left(\frac{p+t}{p}\right)^{p+t}
\left(\frac{1-p-t}{1-p}\right)^{1 - p-t}\right]^n
= e^{\DKL(p+t \| p)n}.
\]
We now apply Lemma~\ref{lem:encoding} to $\mathcal{D}$ and $w$ to get
\[
\Pr[X \geq (p+t)n] = \Pr_{x\sim \mathcal{D}} [ k_x \geq (p+t)n]
\leq \Pr_{x \sim \mathcal{D}} \left[ w(x) \geq 
p_xe^{\DKL(p+t \| p)n}\right]
    \leq e^{-\DKL(p+t \| p)n},
\]
as claimed in Theorem~\ref{thm:chernoff}.
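Note that Lemma~\ref{lem:encoding} is indeed applicable, since 
$s = e^{\DKL(p+t \| p)n} \geq 1$: the Kullback-Leibler divergence is always
nonnegative, because, by Jensen's inequality applied to the concave function $\ln$,
\[
  -\DKL(P \| Q) = \sum_{i \colon p_i > 0} p_i \ln \frac{q_i}{p_i}
  \leq \ln\Bigl(\sum_{i \colon p_i > 0} q_i\Bigr) \leq \ln 1 = 0.
\]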

We provide some coding-theoretic background to explain the
intuition behind the proof.
A \emph{code} for $\{0, 1\}^n$ is an injective function 
$C: \{0, 1\}^n \rightarrow \{0, 1\}^*$.
The elements of the image of $C$ are called \emph{codewords}.
A code is called \emph{prefix-free} if no codeword is the 
prefix of another
codeword, i.e., for all $x, y \in \{0, 1\}^n$ with $x \neq y$ and 
$|C(x)| \leq |C(y)|$, the codeword $C(x)$ differs from the first 
$|C(x)|$ bits of $C(y)$ in at least one position. A prefix-free code
has a natural representation as a rooted binary tree in which the leaves 
correspond to elements of $\{0, 1\}^n$. Even though the codeword lengths
in a prefix-free code may vary, this structure 
imposes a restriction on the allowed
lengths. This is formalized in \emph{Kraft's inequality}.

\begin{lemma}[Kraft's inequality]
Let $C : \{0, 1\}^n \rightarrow \{0, 1\}^*$ be a prefix-free code. Then,
\[
  \sum_{x \in \{0, 1\}^n} 2^{-|C(x)|} \leq 1.
\]
Conversely, given a function $\ell: \{0, 1\}^n \rightarrow \N$ 
with
\[
  \sum_{x \in \{0, 1\}^n} 2^{-\ell(x)} \leq 1,
\]
there exists a prefix-free code $C : \{0, 1\}^n \rightarrow \{0, 1\}^*$ 
with $|C(x)| = \ell(x)$ for all $x \in \{0, 1\}^n$.
\end{lemma}

\begin{proof}
Let $m = \max_{x \in \{0,1\}^n} |C(x)|$, and let $y$ be a uniformly random 
element of $\{0, 1\}^m$. Then, for each $x \in \{0,1\}^n$, the probability that
$C(x)$ is a prefix of $y$ is exactly $2^{-|C(x)|}$. Furthermore, since
$C$ is prefix-free, these events are mutually exclusive. Thus,
\[
  \sum_{x \in \{0, 1\}^n} 2^{-|C(x)|} \leq 1,
\]
as claimed.

Next, we prove the second part. Let $m = \max_{x \in \{0,1\}^n} \ell(x)$
and let $T$ be a complete binary tree of height $m$. 
We construct $C$ according to the following algorithm: 
we set $X = \{0,1\}^n$, and we pick $x^* \in X$ with 
$\ell(x^*) = \min_{x \in X} \ell(x)$. Then we select a remaining node
$v$ of $T$ at depth $\ell(x^*)$. We set $C(x^*)$ to the 
codeword of length $\ell(x^*)$ that corresponds to $v$, and we 
remove $v$ and all its descendants
from $T$. This deletes exactly $2^{m-\ell(x^*)}$ leaves from
$T$. Next, we remove $x^*$ from $X$, and we repeat this procedure until 
$X$ is empty.
While $X \neq \emptyset$, we have
\[
  \sum_{x \in \{0, 1\}^n \setminus X} 2^{m-\ell(x)} < 2^m,
\]
so in each iteration, $T$ still contains at least one leaf. Since a surviving 
leaf has depth $m \geq \ell(x^*)$ and none of its ancestors has been removed, 
$T$ also contains a remaining node at depth $\ell(x^*)$, so the algorithm never 
gets stuck. Since we assign the nodes
by increasing depth, and since all descendants of an assigned node
are deleted from the tree, the resulting code is prefix-free.
\end{proof}
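As a small illustration of the second part, take $n = 2$ and 
$\ell(00) = 1$, $\ell(01) = 2$, $\ell(10) = \ell(11) = 3$. Then
\[
  \sum_{x \in \{0,1\}^2} 2^{-\ell(x)} 
  = \tfrac{1}{2} + \tfrac{1}{4} + \tfrac{1}{8} + \tfrac{1}{8} = 1,
\]
and one possible outcome of the construction is the prefix-free code
$C(00) = 0$, $C(01) = 10$, $C(10) = 110$, $C(11) = 111$.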
 
Kraft's inequality shows that a prefix-free code $C$
induces a valid weight function $w(x) = 2^{-|C(x)|}$.
Thus, Lemma~\ref{lem:encoding} implies that for any
probability distribution $p_x$ on $\{0,1\}^n$ and for any prefix-free
code, the probability mass of the strings $x$
with codeword length at most $\log(1/p_x) - s$ is at most $2^{-s}$.
Now, if we set 
$\ell(x) = \lceil -k_x \log (p+t) - (n-k_x)\log (1-p-t)\rceil$ for
$x \in \{0,1\}^n$, the converse of Kraft's inequality shows that
there exists a prefix-free code $C'$ with $|C'(x)| = \ell(x)$.
The calculation above shows that $C'$ saves roughly
$n(p+t)\log((p+t)/p) + n(1-p-t)\log((1-p-t)/(1-p))$ bits over 
$\log(1/p_x)$ for any $x$ with $k_x \geq (p+t)n$,
which almost gives the desired result.
We generalize to arbitrary valid weight functions
to avoid the slack introduced by the ceiling function.

\section{Useful Consequences}\label{sec:conseq}

\subsection{The Lower Tail}

\begin{corollary}\label{cor:chernoff_lower}
Let $X_1, \dots, X_n$ be independent random variables with 
$X_i \in \{0,1\}$ and $\Pr[X_i = 1] = p$, for $i = 1, \dots, n$. 
Set $X \eqdef \sum_{i=1}^n X_i$. Then, for any $t \in [0, p]$, we have
\[
  \Pr[X \leq (p-t)n ] \leq e^{-\DKL(p-t \| p)n}.
\]
\end{corollary}

\begin{proof}
\begin{align*}
  \Pr[X \leq (p-t)n ] = \Pr[n-X \geq n-(p-t)n ] = \Pr[X' \geq (1-p+t)n ],
\end{align*}
where $X' \eqdef n - X = \sum_{i=1}^{n} X_i'$ with $X_i' \eqdef 1 - X_i$. The $X_i'$
are independent, take values in $\{0,1\}$, and satisfy $\Pr[X_i' = 1] = 1-p$.
Applying Theorem~\ref{thm:chernoff} to the $X_i'$ with deviation 
$t \in [0, p] = [0, 1 - (1-p)]$, the 
result follows from $\DKL(1-p+t \| 1-p) = \DKL(p-t \| p)$.
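Indeed, by the definition of the Kullback-Leibler divergence,
\[
  \DKL(1-p+t \| 1-p) 
  = (1-p+t)\ln\frac{1-p+t}{1-p} + (p-t)\ln\frac{p-t}{p}
  = \DKL(p-t \| p).
\]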
\end{proof}

\subsection{The Motwani-Raghavan Version}

\begin{corollary}\label{cor:MR}
Let $X_1, \dots, X_n$ be independent random variables with 
$X_i \in \{0,1\}$ and $\Pr[X_i = 1] = p$, for $i = 1, \dots, n$. Set 
$X \eqdef \sum_{i=1}^n X_i$ and
$\mu = pn$.  Then, for any $\delta \geq 0$, we have
\[
  \Pr[X \geq (1+\delta)\mu ] \leq 
    \left(\frac{e^{\delta}}{(1+\delta)^{1+\delta}}\right)^\mu,
\]
and, for any $\delta \in [0,1]$, we have
\[
  \Pr[X \leq (1-\delta)\mu ] \leq 
   \left(\frac{e^{-\delta}}{(1-\delta)^{1-\delta}}\right)^\mu.
\]
\end{corollary}
\begin{proof}
We may assume that $(1+\delta)p \leq 1$, since otherwise the probability in
question is $0$. Setting $t = \delta\mu/n = \delta p$ in Theorem~\ref{thm:chernoff} yields
\begin{align*}
\Pr[X \geq (1+\delta)\mu] &\leq
\exp\left(-n\left[p(1+\delta)\ln(1+\delta) + 
p\left(\frac{1-p}{p}-\delta \right)\ln\left(1-\delta 
   \frac{p}{1-p}\right)\right]\right)\\
&= \left(\frac{(1-\delta p/(1-p))^{\delta - (1-p)/p}}{(1+\delta)^{1+\delta}}\right)^\mu\\
&\leq \left(\frac{e^{-\delta^2p/(1-p) + \delta}}{(1+\delta)^{1+\delta}}\right)^\mu
\leq \left(\frac{e^{\delta}}{(1+\delta)^{1+\delta}}\right)^\mu.
\end{align*}
Here, the last line uses $\ln\bigl(1-\delta p/(1-p)\bigr) \leq -\delta p/(1-p)$,
together with $\delta \leq (1-p)/p$, and then $e^{-\delta^2 p/(1-p)} \leq 1$.
Similarly, since $\delta \leq 1$, we may set $t = \delta\mu/n = \delta p \in [0,p]$
in Corollary~\ref{cor:chernoff_lower} and obtain
\begin{align*}
\Pr[X \leq (1-\delta)\mu] &\leq
\exp\left(-n\left[p(1-\delta)\ln(1-\delta) + 
p\left(\frac{1-p}{p}+\delta \right)\ln\left(1+\delta \frac{p}{1-p}\right)\right]\right)\\
&= \left(\frac{\bigl(1+\delta p/(1-p)\bigr)^{-\delta - (1-p)/p}}{(1-\delta)^{1-\delta}}\right)^\mu
\leq \left(\frac{e^{-\delta}}{(1-\delta)^{1-\delta}}\right)^\mu.
\end{align*}
For the last inequality, set $a \eqdef \delta p/(1-p)$ and note that 
$\delta + (1-p)/p = \delta(1+a)/a$; the inequality then follows from
$(1+a)\ln(1+a) \geq a$, for $a \geq 0$.
\end{proof}

\subsection{Handy Versions}

\begin{corollary}\label{cor:handy_lower}
  Let $X_1, \dots, X_n$ be independent random variables with $X_i \in \{0,1\}$
  and $\Pr[X_i = 1] = p$, for $i = 1, \dots, n$. Set $X \eqdef \sum_{i=1}^n X_i$ and
  $\mu = pn$.
  Then, for any $\delta \in (0, 1)$, we have
  \[
    \Pr[X \leq (1-\delta)\mu ] \leq e^{-\delta^2\mu/2}.
  \]
\end{corollary}
\begin{proof}
By Corollary~\ref{cor:MR}
\[    
  \Pr[X \leq (1-\delta)\mu ] \leq \left(\frac{e^{-\delta}}{(1-\delta)^{1-\delta}}\right)^\mu. 
\]
Using the power series expansion of $\ln(1-\delta)$, we get
\[
  (1-\delta)\ln(1-\delta) = -(1 - \delta) \sum_{i=1}^\infty \frac{\delta^i}{i}
  = -\delta + \sum_{i=2}^\infty \frac{\delta^i}{(i-1)i}
  \geq -\delta + \delta^2/2.
\]
Thus,
\[    
  \Pr[X \leq (1-\delta)\mu ] \leq e^{[-\delta + \delta-\delta^2/2]\mu} = 
    e^{-\delta^2\mu/2},
\]
as claimed.
\end{proof}

\begin{corollary}\label{cor:handy_upper}
  Let $X_1, \dots, X_n$ be independent random variables with $X_i \in \{0,1\}$
  and $\Pr[X_i = 1] = p$, for $i = 1, \dots, n$. Set $X \eqdef \sum_{i=1}^n X_i$ and
  $\mu = pn$.
  Then, for any $\delta \geq 0$, we have
  \[
    \Pr[X \geq (1+\delta)\mu ] \leq e^{-\min\{\delta^2,\delta\}\mu/4}.
  \]
\end{corollary}
\begin{proof}
We may assume that $(1+\delta)p \leq 1$, since otherwise the probability in question
is $0$. Then Theorem~\ref{thm:chernoff}, applied with $t = \delta p \in [0, 1-p]$, gives
  \[
    \Pr[X \geq (1+\delta)pn ] \leq e^{-\DKL((1+\delta)p \| p)n}.
  \]
Define $f(\delta) \eqdef \DKL((1+\delta)p \| p)$.
Then 
\[
  f'(\delta) = p \ln(1+\delta) -  p \ln (1-\delta p / (1-p))
\]
and
\[
  f''(\delta) = \frac{p}{(1+\delta)(1-p - \delta p)} \geq \frac{p}{1+\delta}. 
\]
By Taylor's theorem, we have
\[
  f(\delta) = f(0) + \delta f'(0) + \frac{\delta^2}{2} f''(\xi), 
\]
for some $\xi \in [0,\delta]$. Since $f(0) = f'(0) = 0$, it follows that
\[
  f(\delta) =  \frac{\delta^2}{2} f''(\xi) \geq \frac{\delta^2p}{2(1+\xi)} \geq
\frac{\delta^2p}{2(1+\delta)}.
\]
For $\delta \geq 1$, we have $\delta/(1+\delta) \geq 1/2$, and for
$\delta < 1$, we have $1/(1+\delta) \geq 1/2$. Hence, for all $\delta \geq 0$,
\[
  f(\delta) \geq  \min\{\delta^2, \delta\}p/4,
\]
and the claim follows.
\end{proof}

\begin{corollary}
  Let $X_1, \dots, X_n$ be independent random variables with $X_i \in \{0,1\}$
  and $\Pr[X_i = 1] = p$, for $i = 1, \dots, n$. Set $X \eqdef \sum_{i=1}^n X_i$ and
  $\mu = pn$.
  Then, for any $\delta > 0$, we have
  \[
    \Pr[|X - \mu| \geq \delta\mu ] \leq 2e^{-\min\{\delta^2, \delta\}\mu/4}.
  \]
\end{corollary}

\begin{proof}
For $\delta \in (0,1)$, combine Corollaries~\ref{cor:handy_lower} 
and~\ref{cor:handy_upper}. For $\delta \geq 1$, the upper tail is still covered by
Corollary~\ref{cor:handy_upper}, and the lower tail is immediate: for $\delta = 1$, 
we have $\Pr[X \leq 0] = (1-p)^n \leq e^{-pn} = e^{-\mu} \leq e^{-\mu/4}$, and for 
$\delta > 1$ and $\mu > 0$, the event $X \leq (1-\delta)\mu$ is impossible 
(for $\mu = 0$, the claimed bound is trivial).
\end{proof}

\begin{corollary}
  Let $X_1, \dots, X_n$ be independent random variables with $X_i \in \{0,1\}$
  and $\Pr[X_i = 1] = p$, for $i = 1, \dots, n$. Set $X \eqdef \sum_{i=1}^n X_i$ and
  $\mu = pn$. For $t \geq 2e\mu$, we have
  \[
    \Pr[X \geq t ] \leq 2^{-t}.
  \]
\end{corollary}

\begin{proof}
By Corollary~\ref{cor:MR}
\[    
  \Pr[X \geq (1+\delta)\mu ] \leq \left(\frac{e^{\delta}}{(1+\delta)^{1+\delta}}\right)^\mu 
    \leq \left(\frac{e}{1+\delta}\right)^{(1+\delta)\mu},
\]
for any $\delta \geq 0$, where the second inequality uses $e^{\delta} \leq e^{1+\delta}$.
Now set $\delta \eqdef t/\mu - 1$, so that $(1+\delta)\mu = t$ and, since $t \geq 2e\mu$,
$1+\delta \geq 2e$. Then $e/(1+\delta) \leq 1/2$, and the claim follows.
\end{proof}


\section{Generalizations}

We mention a few generalizations of the proof techniques
from Section~\ref{sec:proofs}. Since the consequences from
Section~\ref{sec:conseq} are based on simple algebraic manipulation
of the bounds, the same consequences also hold for the generalized
settings.

\subsection{Hoeffding Extension}

\begin{theorem}\label{thm:chernoff_hoeff}
  Let $X_1, \dots, X_n$ be independent random variables with $X_i \in [0,1]$
  and $\EX[X_i] = p_i$.
  Set $X \eqdef \sum_{i=1}^n X_i$ and $p \eqdef (1/n)\sum_{i=1}^n p_i$.
  Then, for any $t \in [0, 1-p]$, we have
  \[
    \Pr[X \geq (p+t)n ] \leq e^{-\DKL(p+t \| p)n}.
  \]
\end{theorem}

\begin{proof}
  The proof generalizes the moment method.
  Let $\lambda > 0$ be a parameter to be determined later. As before,
  Markov's inequality yields
  \[
  \Pr\bigl[e^{\lambda X} \geq e^{\lambda (p+t)n} \bigr] \leq
    \frac{\EX[e^{\lambda X}]}{e^{\lambda (p+t)n}}.
  \]
  Using independence, we get
  \begin{equation}\label{equ:boundExp}
    \EX[e^{\lambda X}] = \EX\Bigl[e^{\lambda \sum_{i=1}^n X_i}\Bigr]
    =  \prod_{i=1}^n \EX\Bigl[e^{\lambda X_i}\Bigr].
  \end{equation}
  Now we need to estimate $\EX\bigl[e^{\lambda X_i}\bigr]$.
  The function $z \mapsto e^{\lambda z}$ is convex,
  so $e^{\lambda z} 
  \leq (1-z) e^{0 \cdot \lambda} + z e^{1 \cdot \lambda}$ for $z \in [0,1]$.
  Hence,
  \[
    \EX\bigl[e^{\lambda X_i}\bigr] \leq \EX[1-X_i + X_i e^\lambda] = 
      1-p_i + p_ie^\lambda.
  \]
  Going back to (\ref{equ:boundExp}),
  \[
    \EX[e^{\lambda X}] \leq 
      \prod_{i=1}^n (1-p_i + p_i e^\lambda).
  \]
  Using the arithmetic-geometric mean inequality $\prod_{i=1}^n x_i \leq
    \bigl((1/n)\sum_{i=1}^n x_i\bigr)^n$, for $x_i \geq 0$, this is
  \[
    \EX[e^{\lambda X}] \leq (1-p+pe^\lambda)^n.
  \]
  From here we continue as in Section~\ref{sec:moment}.
\end{proof}

\subsection{Hypergeometric Distribution}

Chv\'atal's proof generalizes to the \emph{hypergeometric} distribution.
\begin{theorem}\label{thm:hyper}
  Suppose we have an urn with $N$ balls, $P$ of which are red.
  We randomly draw $n$ balls from the urn without replacement.
  Let $H(N,P,n)$ denote the number of red balls in the sample.
  Set $p \eqdef P/N$.
  Then, for any $t \in [0, 1-p]$, we have
  \[
    \Pr[H(N,P,n) \geq (p+t)n ] \leq e^{-\DKL(p+t \| p)n}.
  \]
\end{theorem}

\begin{proof}
It is well known that
\[
\Pr[H(N,P,n) = l] = \binom{P}{l}\binom{N-P}{n-l}\binom{N}{n}^{-1},
\]
for $l = 0, \dots, n$. 
\begin{claim}\label{clm:hyper_bound}
For every $j \in \{0, \dots, n\}$, we have
\[
\binom{N}{n}^{-1} \sum_{i = j}^n \binom{P}{i}\binom{N-P}{n - i}\binom{i}{j}
\leq \binom{n}{j}p^j.
\]
\end{claim}
\begin{proof}
Consider the following random experiment: take a random permutation of the $N$
balls in the urn. Let $S$ be the sequence of the first $n$ elements in the permutation.
Let $X$ be the number of $j$-subsets of $S$ that contain only red balls.
We compute $\EX[X]$ in two different ways. On the one hand, 
\begin{equation}\label{equ:hyper_bound_var1}
\EX[X] = \sum_{i = j}^n \Pr[\text{$S$ contains $i$ red balls}] \binom{i}{j}
  = \sum_{i = j}^n \binom{N}{n}^{-1} \binom{P}{i} \binom{N-P}{n-i} \binom{i}{j}.
\end{equation}
On the other hand, let $I \subseteq \{1, \dots, n\}$ with 
$|I| = j$. Then the probability that all the balls in the positions
indexed by $I$ are red is
\[
\frac{P}{N} \cdot \frac{P-1}{N-1} \cdots
\frac{P-j+1}{N-j+1} \leq \left(\frac{P}{N}\right)^j = p^j. 
\]
Thus, by linearity of expectation $\EX[X] \leq \binom{n}{j} p^j$. Together with
(\ref{equ:hyper_bound_var1}), the claim follows.
\end{proof}
\begin{claim}\label{clm:hyper_bound2}
For every $\tau \geq 1$, we have
\[
\binom{N}{n}^{-1} \sum_{i = 0}^n \binom{P}{i}\binom{N-P}{n - i}\tau^i
\leq (1 + (\tau-1)p)^n.
\]
\end{claim}
\begin{proof}
Using Claim~\ref{clm:hyper_bound} and the Binomial theorem (twice),
\begin{align*}
\binom{N}{n}^{-1} \sum_{i = 0}^n \binom{P}{i}\binom{N-P}{n - i}\tau^i
&= \binom{N}{n}^{-1} \sum_{i = 0}^n \binom{P}{i}\binom{N-P}{n - i}
  (1+(\tau-1))^i\\
&= \binom{N}{n}^{-1} \sum_{i = 0}^n \binom{P}{i}\binom{N-P}{n - i}
  \sum_{j=0}^i \binom{i}{j}(\tau-1)^j\\ 
&= \binom{N}{n}^{-1} \sum_{j = 0}^n (\tau-1)^j \sum_{i=j}^n 
  \binom{P}{i}\binom{N-P}{n - i}\binom{i}{j}\\
&\leq \sum_{j = 0}^n \binom{n}{j}((\tau-1)p)^j = (1 + (\tau-1)p)^n,
\end{align*}
as claimed.
\end{proof}

Thus, for any $\tau \geq 1$ and $k \geq pn$, we get as before
\begin{multline*}
\Pr[H(N,P,n)\geq k] = \binom{N}{n}^{-1}\sum_{i=k}^n \binom{P}{i}\binom{N-P}{n-i}\\
\leq \binom{N}{n}^{-1}\sum_{i=0}^n \binom{P}{i}\binom{N-P}{n-i}
\tau^{i-k}\leq
\frac{(p\tau+1-p)^n}{\tau^k},
\end{multline*}
by Claim~\ref{clm:hyper_bound2}. From here the proof proceeds as in 
Section~\ref{sec:chvatal}.
\end{proof}


\subsection{General Impagliazzo-Kabanets}
\begin{theorem}\label{thm:general_ik}
  Let $X_1, \dots, X_n$ be random variables with $X_i \in \{0,1\}$.
  Suppose there exist $p_i \in [0,1]$, $i = 1, \dots, n$, such 
  that for every index set $I \subseteq \{1, \dots, n\}$, we
  have $\Pr[\prod_{i \in I} X_i = 1] \leq \prod_{i \in I} p_i$.
  Set $X \eqdef \sum_{i=1}^n X_i$ and $p \eqdef (1/n)\sum_{i=1}^n p_i$.
  Then, for any $t \in [0, 1-p]$, we have
  \[
    \Pr[X \geq (p+t)n ] \leq e^{-\DKL(p+t \| p)n}.
  \]
\end{theorem}

\begin{proof}
Let $\lambda \in [0,1]$ be a parameter to be chosen later. 
Let $I \subseteq \{1, \dots, n\}$ be a random index set obtained by
including each element $i \in \{1, \dots, n\}$ with probability 
$\lambda$.
As before, we estimate the probability
$\Pr\bigl[\prod_{i \in I} X_i = 1\bigr]$ in two different ways, where
the probability is over the random choice of $X_1, \dots, X_n$ and $I$.
Similarly to before,
\begin{multline}\label{equ:ikupper_gen}
\Pr\Bigl[\prod_{i \in I} X_i = 1\Bigr]
\leq \sum_{S \subseteq \{1, \dots, n\}} 
     \Pr\Bigl[I = S \wedge  \prod_{i \in S} X_i = 1\Bigr]\\
\leq \sum_{S \subseteq \{1, \dots, n\}} \Pr[I = S] \cdot  
    \Pr\Bigl[ \prod_{i \in S} X_i = 1\Bigr]
\leq \sum_{S \subseteq \{1, \dots, n\}} \lambda^{|S|}(1-\lambda)^{n-|S|} \cdot 
    \prod_{i \in S} p_i.
\end{multline}

We define $n$ independent random variables $Z_1, \dots, Z_n$ as follows: 
for $i = 1, \dots, n$, with probability $1-\lambda$,
we set $Z_i = 1$, and with probability $\lambda$, we set $Z_i = p_i$.
Then the right hand side of (\ref{equ:ikupper_gen}) equals 
$\EX\bigl[\prod_{i=1}^n Z_i\bigr]$. Using the independence of the $Z_i$ and the 
arithmetic-geometric mean inequality, we obtain
\begin{equation}\label{equ:ikupper2_gen}
\Pr\Bigl[\prod_{i \in I} X_i = 1\Bigr]
\leq \EX\Bigl[\prod_{i = 1}^n Z_i\Bigr] = 
\prod_{i=1}^n\EX[Z_i]
=
\prod_{i=1}^n (1-\lambda + p_i \lambda)
\leq (1-\lambda + p \lambda)^n.
\end{equation}

The proof of the lower bound remains unchanged and yields
\[
\Pr\Bigl[\prod_{i \in I} X_i = 1\Bigr]
 \geq
(1-\lambda)^{(1-p-t)n}\Pr[X \geq (p+t)n],
\]
as before. Combining with (\ref{equ:ikupper2_gen}) and optimizing
for $\lambda$ finishes the proof, see Section~\ref{sec:ik}.
\end{proof}
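We note that Theorem~\ref{thm:general_ik} yields, in particular, another proof of
Theorem~\ref{thm:hyper}: let $X_i$ be the indicator random variable for the event
that the $i$th ball drawn from the urn is red, and set $p_i = p = P/N$, for 
$i = 1, \dots, n$, so that $H(N,P,n) = \sum_{i=1}^n X_i$. For every index set 
$I \subseteq \{1, \dots, n\}$ with $|I| = j$, the calculation in the proof of 
Claim~\ref{clm:hyper_bound} shows that
\[
  \Pr\Bigl[\prod_{i \in I} X_i = 1\Bigr] 
  = \frac{P}{N} \cdot \frac{P-1}{N-1} \cdots \frac{P-j+1}{N-j+1}
  \leq p^{j} = \prod_{i \in I} p_i,
\]
so the assumption of Theorem~\ref{thm:general_ik} is satisfied.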
\end{document}

