\classheader{2012-11-29}
\subsection*{Independence of Random Variables}
Let $X_1,\ldots,X_n$ be random variables.
\begin{definition}
Their joint distribution is the function
\[F(t_1,\ldots,t_n)=P(X_1\leq t_1,\ldots,X_n\leq t_n).\]
Their joint density, if it exists, is the function $f$ such that
\[\int_Bf(x_1,\ldots,x_n)\,dx_1\cdots dx_n=P((X_1,\ldots,X_n)\in B)\]
for all Borel $B\subseteq\R^n$. We define
\[\mu(B)\overset{\text{def}}{=} P((X_1,\ldots,X_n)\in B).\]
\end{definition}
The following statements are equivalent:
\begin{itemize}
\item $X_1,\ldots,X_n$ are independent
\item The $\sigma$-algebras $\cal{T}_1,\ldots,\cal{T}_n$ generated by $X_1,\ldots,X_n$ are independent
\item $\mu=\mu_1\times\cdots\times \mu_n$, where $\mu_j(B)=P(X_j\in B)$ is the distribution of $X_j$
\item If the densities exist, $f(x_1,\ldots,x_n)=f_1(x_1)\cdots f_n(x_n)$
\item $P(X_1\in B_1,\ldots,X_n\in B_n)=P(X_1\in B_1)\cdots P(X_n\in B_n)$ for all Borel $B_1,\ldots,B_n\subseteq\R$.
\end{itemize}
For any $X$ and $Y$, we know that $E(X+Y)=E(X)+E(Y)$ because the integral is additive. If $X$ and $Y$ are independent, it is also true that $E(XY)=E(X)E(Y)$. We can see this by looking at discrete events (simple functions): if $X=c_j$ with probability $p_j$ (we call this event $A_j$) and $Y=d_m$ with probability $q_m$ (we call this event $B_m$), then
\[E(X)=\sum_j p_jc_j,\qquad E(Y)=\sum_m q_md_m,\]
and because $P(A_j\cap B_m)=p_jq_m$ by independence, we have
\[E(XY)=\sum_{j,m}p_jq_mc_jd_m=\Big(\sum_j p_jc_j\Big)\Big(\sum_m q_md_m\Big)=E(X)E(Y).\]
We can then pass to arbitrary measurable functions in the standard way (take monotone limits of non-negative simple functions to get arbitrary non-negative measurable functions, then consider differences of non-negative measurable functions).
\begin{homework}
We've shown that independent random variables $X$ and $Y$ are orthogonal, i.e. they satisfy $E(XY)=E(X)E(Y)$. Is the converse true?
\end{homework}
\begin{proposition}
If $X_1,\ldots,X_n$ are pairwise orthogonal, then
\[\Var(X_1+\cdots+X_n)=\Var(X_1)+\cdots+\Var(X_n).\]
\end{proposition}
\begin{proof}
Because $\Var(X)=\Var(X+c)$ for a constant $c$, and because subtracting constants preserves orthogonality, we can assume that $E(X_j)=0$ for all $j$; orthogonality then gives $E(X_iX_j)=E(X_i)E(X_j)$ for $i\neq j$. Then
\begin{align*}
\Var(X_1+\cdots+X_n)&=E\big[(X_1+\cdots+X_n)^2\big]\\
&=E\Big(\textstyle\sum_j X_j^2+2\sum_{i<j} X_iX_j\Big)\\
&=\textstyle\sum_j E(X_j^2) + 2\sum_{i<j} E(X_iX_j)\\
&=\textstyle\sum_j \Var(X_j) +2\sum_{i<j}\underbrace{E(X_i)}_{=\,0}\underbrace{E(X_j)}_{=\,0}\\
&=\Var(X_1)+\cdots+\Var(X_n)\qedhere
\end{align*}
\end{proof}
There are extreme cases where $\Var$ is not additive:
\[\Var(X-X)=0,\qquad\Var(X+X)=4\Var(X).\]
\begin{definition}
When we are doing probability, we refer to convergence in measure as convergence in probability. Thus, $X_n\to X$ in probability when for all $\epsilon>0$, $P(|X_n-X|>\epsilon)\to 0$ as $n\to\infty$.
\end{definition}
\begin{definition}
When we are doing probability, we refer to convergence a.e. as almost sure convergence. Thus, $X_n\to X$ almost surely when there exists an event $A$ with $P(A)=1$ such that $X_n(\omega)\to X(\omega)$ for all $\omega\in A$.
\end{definition}
\begin{homework}
Suppose that $X_1,X_2,\ldots$ have the property that $E(X_n)\to \mu$ and $\Var(X_n)\to 0$. Show that $X_n\to \mu$ in probability, but not necessarily almost surely.
\end{homework}
\begin{definition}
We say $X_1,X_2,\ldots$ are independent identically distributed (i.i.d.) random variables when they are independent and have the same distribution. Intuitively, this means they are independent repetitions of the same experiment, e.g. $X_1=$ the result of flipping a coin, $X_2=$ the result of flipping the coin again, etc.
\end{definition}
Denoting $E(X_j)=\mu$ and $\Var(X_j)=\sigma^2$ for all $j$, we have that
\[E\left(\frac{X_1+\cdots+X_n}{n}\right)=\mu,\qquad \Var\left(\frac{X_1+\cdots+X_n}{n}\right)=\frac{\sigma^2}{n},\]
the second equality because independent random variables are pairwise orthogonal.
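For a concrete check (an example, not from class): if the $X_j$ are i.i.d.\ with $P(X_j=1)=p$ and $P(X_j=0)=1-p$, then $\mu=p$ and $\sigma^2=E(X_j^2)-\mu^2=p-p^2=p(1-p)$, so
\[E\left(\frac{X_1+\cdots+X_n}{n}\right)=p,\qquad \Var\left(\frac{X_1+\cdots+X_n}{n}\right)=\frac{p(1-p)}{n}\leq\frac{1}{4n}\to 0.\]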
\begin{theorem}[Weak law of large numbers]
If $X_1,X_2,\ldots$ are i.i.d.\ with finite mean $\mu$, then
\[\frac{X_1+\cdots+X_n}{n}\to \mu\]
in probability.
\end{theorem}
\begin{theorem}[Strong law of large numbers]
If $X_1,X_2,\ldots$ are i.i.d.\ with finite mean $\mu$, then
\[\frac{X_1+\cdots+X_n}{n}\to \mu\]
almost surely.
\end{theorem}
Recall that for a sequence of sets $A_1,A_2,\ldots$, we define $\limsup_{j\to\infty}A_j=\bigcap_{m=1}^\infty\bigcup_{j=m}^\infty A_j$; this is the event that infinitely many of the $A_j$ occur.
\begin{theorem}[Borel-Cantelli lemma]
$\text{}$
\begin{enumerate}
\item If $\sum P(A_j)<\infty$, then $P(\limsup_{j\to\infty} A_j)=0$.
\item If $\sum P(A_j)=\infty$ and the $A_j$ are independent, then $P(\limsup_{j\to\infty}A_j)=1$.
\end{enumerate}
\end{theorem}
\begin{proof}[Proof of 1]
This is clear because the tail of a convergent sum goes to 0: for every $m$,
\[P\Big(\limsup_{j\to\infty}A_j\Big)\leq P\bigg(\bigcup_{j=m}^\infty A_j\bigg)\leq\sum_{j=m}^\infty P(A_j)\to 0\quad\text{as }m\to\infty.\qedhere\]
\end{proof}
\begin{proof}[Proof of 2]
For any $m$,
\[P(A_m^c\cap A_{m+1}^c\cap\cdots)\overset{\text{independence}}{=}\prod_{n=m}^\infty (1-P(A_n))\leq\exp\bigg(-\sum_{n=m}^\infty P(A_n)\bigg)=0\]
because $1-x\leq e^{-x}$ and $\sum_{n=m}^\infty P(A_n)=\infty$. Then $A_m\cup A_{m+1}\cup\cdots$ happens almost surely, i.e. $P(\bigcup_{n=m}^\infty A_n)=1$, so that
\[P\left(\bigcap_{m=1}^\infty\bigcup_{n=m}^\infty A_n\right)=1.\qedhere\]
\end{proof}
\begin{theorem}[Kolmogorov 0-1 Law]
Let $X_1,X_2,\ldots$ be independent random variables. Define the $\sigma$-algebras
\begin{align*}
\cal{F}_1&=\sigma(X_1) & \cal{G}_1&=\sigma(X_2,X_3,\ldots)\\
\cal{F}_2&=\sigma(X_1,X_2) & \cal{G}_2&=\sigma(X_3,X_4,\ldots)\\
&\!\cdots & \text{} & \!\cdots\\[-0.3in]
\end{align*}
Then $\cal{F}_1\subseteq\cal{F}_2\subseteq\cdots$ and $\cal{G}_1\supseteq\cal{G}_2\supseteq\cdots$, but the limit $\cal{F}_0=\bigcup_{n=1}^\infty\cal{F}_n$ is only an algebra, not necessarily a $\sigma$-algebra. However, the limit $\cal{T}=\bigcap_{n=1}^\infty\cal{G}_n$ (the tail $\sigma$-algebra) is a $\sigma$-algebra. If $A$ is measurable with respect to $\cal{T}$, then either $P(A)=0$ or $P(A)=1$.
\end{theorem}
\begin{lemma}
Suppose that $\cal{F}^0$ is an algebra, and $\cal{F}$ is the $\sigma$-algebra generated by $\cal{F}^0$. Then for every $A\in\cal{F}$ and every $\epsilon>0$, there is some $B\in\cal{F}^0$ such that $P(A\symdiff B)<\epsilon$.
\end{lemma}
\begin{proof}[Proof of lemma]
It will suffice to show that the collection $\cal{G}$ of sets $A$ which have this property is a $\sigma$-algebra. Trivially, we have that $\cal{F}^0\subseteq\cal{G}$. If $A\in\cal{G}$, then $A^c\in\cal{G}$ because $A\symdiff B=A^c\symdiff B^c$, so $P(A\symdiff B)<\epsilon$ implies that $P(A^c\symdiff B^c)<\epsilon$, and $B^c\in\cal{F}^0$ because $B\in\cal{F}^0$. Now we need to show that if $A_1,A_2,\ldots\in \cal{G}$, then $\bigcup_{n=1}^\infty A_n\in\cal{G}$.

First approach (from class): Given $\epsilon>0$, choose $B_j\in\cal{F}^0$ such that $P(A_j\symdiff B_j)<\frac{\epsilon}{2^j}$. Since
\[\bigcup_{j=1}^\infty A_j\symdiff \bigcup_{j=1}^\infty B_j\subseteq\bigcup_{j=1}^\infty (A_j\symdiff B_j),\]
we have
\[P\bigg(\bigcup_{j=1}^\infty A_j\symdiff \bigcup_{j=1}^\infty B_j\bigg)<\epsilon.\]
By continuity from below, if $N$ is large enough then
\[P\bigg(\bigcup_{j=1}^\infty B_j\symdiff \bigcup_{j=1}^N B_j\bigg)<\epsilon,\]
and therefore
\[P\bigg(\bigcup_{j=1}^\infty A_j\symdiff \bigcup_{j=1}^N B_j\bigg)<2\epsilon,\]
where $\bigcup_{j=1}^N B_j\in\cal{F}^0$, proving the lemma.

Second approach (not from class): Let $C=\bigcup_{n=1}^\infty A_n$; we show $C\in\cal{G}$. Let $\epsilon>0$, and choose an $m$ such that
\[P\bigg(\bigcup_{j=1}^m A_j\bigg)\geq P(C)-\frac{\epsilon}{2}.\]
For $j=1,\ldots,m$, choose $B_j\in\cal{F}^0$ such that $P(A_j\symdiff B_j)\leq \epsilon/2^{j+1}$. Let $B=\bigcup_{j=1}^m B_j$, and note that
\[C\symdiff B \subseteq\bigg(\bigcup_{j=1}^m (A_j\symdiff B_j)\bigg)\cup\bigg(C\setminus\bigcup_{j=1}^m A_j\bigg),\]
so that $P(C\symdiff B)<\epsilon$, and hence $C=\bigcup_{n=1}^\infty A_n\in\cal{G}$.
\end{proof}
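Before proving the theorem, here is a typical tail event (an illustration, not from class): for independent $X_1,X_2,\ldots$, the event
\[A=\Big\{\sum_{n=1}^\infty X_n \text{ converges}\Big\}\]
belongs to $\cal{G}_m$ for every $m$, because convergence of the series is unaffected by the finitely many terms $X_1,\ldots,X_m$. Hence $A\in\cal{T}$, and the theorem says the series either converges almost surely or diverges almost surely.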
\begin{proof}[Proof of Kolmogorov]
We want to show that if $A\in\cal{T}$, then $P(A)=0$ or $P(A)=1$. For any $\epsilon>0$, the lemma lets us choose a $B_\epsilon\in\bigcup_{j=1}^\infty\cal{F}_j$ (remember, this is only an algebra) such that $P(A\symdiff B_\epsilon)<\epsilon$; this is possible because $A\in\cal{T}\subseteq\sigma(X_1,X_2,\ldots)$, which is the $\sigma$-algebra generated by this algebra. Then $B_\epsilon\in\cal{F}_n$ for some $n$; recall that $A\in \cal{G}_n$ for every $n$, and $\cal{F}_n$ and $\cal{G}_n$ are independent. Therefore, $P(A\cap B_\epsilon)=P(A)P(B_\epsilon)$. Also note that
\[P(A\cap B_\epsilon)\geq P(A)-P(A\symdiff B_\epsilon)\geq P(A)-\epsilon,\qquad |P(B_\epsilon)-P(A)|\leq P(A\symdiff B_\epsilon)<\epsilon.\]
Thus, as $\epsilon\to 0$,
\begin{center}
\begin{tikzpicture}[every node/.style={outer sep=0,inner sep=0,text depth=0.3ex}, node distance=0.3em]
\node (a) at (0,1) {$=$};
\node[left=of a.west] {$P(A\cap B_\epsilon)$};
\node[right=of a.east] {$P(A)P(B_\epsilon)$};
\node[below=1cm of a] (b) {$=$};
\node[left=of b.west] {$P(A)$};
\node[right=of b.east] {$P(A)P(A)$};
\draw[-angle 90,shorten >=6pt,shorten <=6pt] ([xshift=9ex] a.south) -- ([xshift=9ex] b.north);
\draw[-angle 90,shorten >=6pt,shorten <=6pt] ([xshift=-5ex] a.south) -- ([xshift=-5ex] b.north);
\end{tikzpicture}
\end{center}
hence $P(A)=P(A)^2$, hence $P(A)=0$ or $P(A)=1$.
\end{proof}
\begin{definition}
We define the Fourier transform of a function $g$ to be
\[\widehat{g}(z)=\int_{-\infty}^\infty e^{-ixz}\,g(x)\,dx.\]
The inverse Fourier transform is
\[g(x)=\frac{1}{2\pi}\int_{-\infty}^\infty e^{ixz}\,\widehat{g}(z)\,dz.\]
\end{definition}
\begin{definition}
We say that $g$ is a Schwartz function when $g$ is $C^\infty$, and all of the derivatives $g^{(j)}(x)$ tend to 0 as $x\to\pm\infty$ faster than any power of $1/x$, i.e. $x^kg^{(j)}(x)\to 0$ as $x\to\pm\infty$ for all $j,k\geq 0$.
\end{definition}
It turns out that if $g$ is Schwartz, then $\widehat{g}$ is Schwartz.
% Csornyei relates that she once had a lecturer start talking about the Fourier transform with \pi = 1 = i because everyone divides by them in different combinations.
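A standard example (not from class): the Gaussian $g(x)=e^{-x^2/2}$ is Schwartz, and
\[\widehat{g}(z)=\int_{-\infty}^\infty e^{-ixz}e^{-x^2/2}\,dx=\sqrt{2\pi}\,e^{-z^2/2},\]
so up to the constant $\sqrt{2\pi}$ the Gaussian is its own Fourier transform. Plugging $\widehat{g}$ into the inversion formula recovers $g$:
\[\frac{1}{2\pi}\int_{-\infty}^\infty e^{ixz}\,\sqrt{2\pi}\,e^{-z^2/2}\,dz=e^{-x^2/2}=g(x).\]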