\classheader{2012-11-29}
\subsection*{Independence of Random Variables}
Let $X_1,\ldots,X_n$ be random variables.
\begin{definition}
Their joint distribution is the function
\[F(t_1,\ldots,t_n)=P(X_1\leq t_1,\ldots,X_n\leq t_n).\]
Their joint density, if it exists, is the function $f$ such that
\[\int_Bf(x_1,\ldots,x_n)\,dx_1\cdots dx_n=P((X_1,\ldots,X_n)\in B)\]
for all Borel $B\subseteq\R^n$. We define
\[\mu(B)\overset{\text{def}}{=} P((X_1,\ldots,X_n)\in B).\]
\end{definition}
The following statements are equivalent:
\begin{itemize}
\item $X_1,\ldots,X_n$ are independent
\item The $\sigma$-algebras $\cal{T}_1,\ldots,\cal{T}_n$ generated by $X_1,\ldots,X_n$ are independent
\item $\mu=\mu_1\times\cdots\times \mu_n$, where $\mu_j(B)=P(X_j\in B)$ is the distribution of $X_j$
\item If the densities exist, $f(x_1,\ldots,x_n)=f_1(x_1)\cdots f_n(x_n)$
\item $P(X_1\in B_1,\ldots,X_n\in B_n)=P(X_1\in B_1)\cdots P(X_n\in B_n)$ for all Borel $B_1,\ldots,B_n\subseteq\R$.
\end{itemize}
For any $X$ and $Y$, we know that $E(X+Y)=E(X)+E(Y)$ because the integral is additive. If $X$ and $Y$ are independent, it is also true that $E(XY)=E(X)E(Y)$. We can see this by looking at discrete events (simple functions): if $X=c_j$ with probability $p_j$ (we call this event $A_j$) and $Y=d_m$ with probability $q_m$ (we call this event $B_m$), then
\[E(X)=\sum_j p_jc_j,\qquad E(Y)=\sum_m q_md_m,\]
and because $P(A_j\cap B_m)=p_jq_m$ by independence, we have
\[E(XY)=\sum_{j,m}p_jq_mc_jd_m=\Big(\sum_j p_jc_j\Big)\Big(\sum_m q_md_m\Big)=E(X)E(Y).\]
We can then pass to arbitrary measurable functions in the standard way (take monotone limits of non-negative simple functions to get arbitrary non-negative measurable functions, then consider differences of non-negative measurable functions).
\begin{homework}
We've shown that independent random variables $X$ and $Y$ are orthogonal, i.e. they satisfy $E(XY)=E(X)E(Y)$. Is the converse true?
\end{homework}
\begin{proposition}
If $X_1,\ldots,X_n$ are pairwise orthogonal, then
\[\Var(X_1+\cdots+X_n)=\Var(X_1)+\cdots+\Var(X_n).\]
\end{proposition}
\begin{proof}
Because $\Var(X)=\Var(X+c)$ for a constant $c$, and because subtracting constants preserves orthogonality, we can assume that $E(X_j)=0$ for all $j$; orthogonality then gives $E(X_iX_j)=E(X_i)E(X_j)$ for $i\neq j$. Then
\begin{align*}
\Var(X_1+\cdots+X_n)&=E\big[(X_1+\cdots+X_n)^2\big]\\
&=E\Big(\textstyle\sum_j X_j^2+2\sum_{i<j} X_iX_j\Big)\\
&=\textstyle\sum_j E(X_j^2) + 2\sum_{i<j} E(X_iX_j)\\
&=\textstyle\sum_j \Var(X_j) +2\sum_{i<j}\underbrace{E(X_i)}_{=\,0}\underbrace{E(X_j)}_{=\,0}\\
&=\Var(X_1)+\cdots+\Var(X_n)\qedhere
\end{align*}
\end{proof}
There are extreme cases where $\Var$ is not additive:
\[\Var(X-X)=0,\qquad\Var(X+X)=4\Var(X).\]
\begin{definition}
When we are doing probability, we refer to convergence in measure as convergence in probability. Thus, $X_n\to X$ in probability when for all $\epsilon>0$, $P(|X_n-X|>\epsilon)\to 0$ as $n\to\infty$.
\end{definition}
\begin{definition}
When we are doing probability, we refer to convergence a.e. as almost sure convergence. Thus, $X_n\to X$ almost surely when there exists an event $A$ with $P(A)=1$ such that $X_n(\omega)\to X(\omega)$ for all $\omega\in A$.
\end{definition}
\begin{homework}
Suppose that $X_1,X_2,\ldots$ have the property that $E(X_n)\to \mu$ and $\Var(X_n)\to 0$. Show that $X_n\to \mu$ in probability, but not necessarily almost surely.
\end{homework}
\begin{definition}
We say $X_1,X_2,\ldots$ are independent identically distributed (i.i.d.) random variables when they are independent and have the same distribution. Intuitively, this means they are independent repetitions of the same experiment, e.g. $X_1=$ the result of flipping a coin, $X_2=$ the result of flipping the coin again, etc.
\end{definition}
Denoting $E(X_j)=\mu$ and $\Var(X_j)=\sigma^2$ for all $j$, we have that
\[E\left(\frac{X_1+\cdots+X_n}{n}\right)=\mu,\qquad \Var\left(\frac{X_1+\cdots+X_n}{n}\right)=\frac{\sigma^2}{n},\]
the second equality because independent random variables are pairwise orthogonal.
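For a concrete check (an example, not from class): if the $X_j$ are i.i.d.\ with $P(X_j=1)=p$ and $P(X_j=0)=1-p$, then $\mu=p$ and $\sigma^2=E(X_j^2)-\mu^2=p-p^2=p(1-p)$, so
\[E\left(\frac{X_1+\cdots+X_n}{n}\right)=p,\qquad \Var\left(\frac{X_1+\cdots+X_n}{n}\right)=\frac{p(1-p)}{n}\leq\frac{1}{4n}\to 0.\]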
\begin{theorem}[Weak law of large numbers]
If $X_1,X_2,\ldots$ are i.i.d.\ with finite mean $\mu$, then
\[\frac{X_1+\cdots+X_n}{n}\to \mu\]
in probability.
\end{theorem}
\begin{theorem}[Strong law of large numbers]
If $X_1,X_2,\ldots$ are i.i.d.\ with finite mean $\mu$, then
\[\frac{X_1+\cdots+X_n}{n}\to \mu\]
almost surely.
\end{theorem}
Recall that for a sequence of sets $A_1,A_2,\ldots$, we define $\limsup_{j\to\infty}A_j=\bigcap_{m=1}^\infty\bigcup_{j=m}^\infty A_j$; this is the event that infinitely many of the $A_j$ occur.
\begin{theorem}[Borel-Cantelli lemma]
$\text{}$
\begin{enumerate}
\item If $\sum P(A_j)<\infty$, then $P(\limsup_{j\to\infty} A_j)=0$.
\item If $\sum P(A_j)=\infty$ and the $A_j$ are independent, then $P(\limsup_{j\to\infty}A_j)=1$.
\end{enumerate}
\end{theorem}
\begin{proof}[Proof of 1]
This is clear because the tail of a convergent sum goes to 0: for every $m$,
\[P\Big(\limsup_{j\to\infty}A_j\Big)\leq P\bigg(\bigcup_{j=m}^\infty A_j\bigg)\leq\sum_{j=m}^\infty P(A_j)\to 0\quad\text{as }m\to\infty.\qedhere\]
\end{proof}
\begin{proof}[Proof of 2]
For any $m$,
\[P(A_m^c\cap A_{m+1}^c\cap\cdots)\overset{\text{independence}}{=}\prod_{n=m}^\infty (1-P(A_n))\leq\exp\bigg(-\sum_{n=m}^\infty P(A_n)\bigg)=0\]
because $1-x\leq e^{-x}$ and $\sum_{n=m}^\infty P(A_n)=\infty$. Then $A_m\cup A_{m+1}\cup\cdots$ happens almost surely, i.e. $P(\bigcup_{n=m}^\infty A_n)=1$, so that
\[P\left(\bigcap_{m=1}^\infty\bigcup_{n=m}^\infty A_n\right)=1.\qedhere\]
\end{proof}
\begin{theorem}[Kolmogorov 0-1 Law]
Let $X_1,X_2,\ldots$ be independent random variables. Define the $\sigma$-algebras
\begin{align*}
\cal{F}_1&=\sigma(X_1) & \cal{G}_1&=\sigma(X_2,X_3,\ldots)\\
\cal{F}_2&=\sigma(X_1,X_2) & \cal{G}_2&=\sigma(X_3,X_4,\ldots)\\
&\!\cdots & \text{} & \!\cdots\\[-0.3in]
\end{align*}
Then $\cal{F}_1\subseteq\cal{F}_2\subseteq\cdots$ and $\cal{G}_1\supseteq\cal{G}_2\supseteq\cdots$, but the limit $\cal{F}_0=\bigcup_{n=1}^\infty\cal{F}_n$ is only an algebra, not necessarily a $\sigma$-algebra. However, the limit $\cal{T}=\bigcap_{n=1}^\infty\cal{G}_n$ (the tail $\sigma$-algebra) is a $\sigma$-algebra. If $A$ is measurable with respect to $\cal{T}$, then either $P(A)=0$ or $P(A)=1$.
\end{theorem}
\begin{lemma}
Suppose that $\cal{F}^0$ is an algebra, and $\cal{F}$ is the $\sigma$-algebra generated by $\cal{F}^0$. Then for every $A\in\cal{F}$ and every $\epsilon>0$, there is some $B\in\cal{F}^0$ such that $P(A\symdiff B)<\epsilon$.
\end{lemma}
\begin{proof}[Proof of lemma]
It will suffice to show that the collection $\cal{G}$ of sets $A$ which have this property is a $\sigma$-algebra. Trivially, we have that $\cal{F}^0\subseteq\cal{G}$. If $A\in\cal{G}$, then $A^c\in\cal{G}$ because $A\symdiff B=A^c\symdiff B^c$, so $P(A\symdiff B)<\epsilon$ implies that $P(A^c\symdiff B^c)<\epsilon$, and $B^c\in\cal{F}^0$ because $B\in\cal{F}^0$. Now we need to show that if $A_1,A_2,\ldots\in \cal{G}$, then $\bigcup_{n=1}^\infty A_n\in\cal{G}$.

First approach (from class): Given $\epsilon>0$, choose $B_j\in\cal{F}^0$ such that $P(A_j\symdiff B_j)<\frac{\epsilon}{2^j}$. Since
\[\bigcup_{j=1}^\infty A_j\symdiff \bigcup_{j=1}^\infty B_j\subseteq\bigcup_{j=1}^\infty (A_j\symdiff B_j),\]
we have
\[P\bigg(\bigcup_{j=1}^\infty A_j\symdiff \bigcup_{j=1}^\infty B_j\bigg)<\epsilon.\]
By continuity from below, if $N$ is large enough then
\[P\bigg(\bigcup_{j=1}^\infty B_j\symdiff \bigcup_{j=1}^N B_j\bigg)<\epsilon,\]
and therefore
\[P\bigg(\bigcup_{j=1}^\infty A_j\symdiff \bigcup_{j=1}^N B_j\bigg)<2\epsilon,\]
where $\bigcup_{j=1}^N B_j\in\cal{F}^0$, proving the lemma.

Second approach (not from class): Let $C=\bigcup_{n=1}^\infty A_n$; we show $C\in\cal{G}$. Let $\epsilon>0$, and choose an $m$ such that
\[P\bigg(\bigcup_{j=1}^m A_j\bigg)\geq P(C)-\frac{\epsilon}{2}.\]
For $j=1,\ldots,m$, choose $B_j\in\cal{F}^0$ such that $P(A_j\symdiff B_j)\leq \epsilon/2^{j+1}$. Let $B=\bigcup_{j=1}^m B_j$, and note that
\[C\symdiff B \subseteq\bigg(\bigcup_{j=1}^m (A_j\symdiff B_j)\bigg)\cup\bigg(C\setminus\bigcup_{j=1}^m A_j\bigg),\]
so that $P(C\symdiff B)<\epsilon$, and hence $C=\bigcup_{n=1}^\infty A_n\in\cal{G}$.
\end{proof}
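Before proving the theorem, here is a typical tail event (an illustration, not from class): for independent $X_1,X_2,\ldots$, the event
\[A=\Big\{\sum_{n=1}^\infty X_n \text{ converges}\Big\}\]
belongs to $\cal{G}_m$ for every $m$, because convergence of the series is unaffected by the finitely many terms $X_1,\ldots,X_m$. Hence $A\in\cal{T}$, and the theorem says the series either converges almost surely or diverges almost surely.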
\begin{proof}[Proof of Kolmogorov]
We want to show that if $A\in\cal{T}$, then $P(A)=0$ or $P(A)=1$. For any $\epsilon>0$, the lemma lets us choose a $B_\epsilon\in\bigcup_{j=1}^\infty\cal{F}_j$ (remember, this is only an algebra) such that $P(A\symdiff B_\epsilon)<\epsilon$; this is possible because $A\in\cal{T}\subseteq\sigma(X_1,X_2,\ldots)$, which is the $\sigma$-algebra generated by this algebra. Then $B_\epsilon\in\cal{F}_n$ for some $n$; recall that $A\in \cal{G}_n$ for every $n$, and $\cal{F}_n$ and $\cal{G}_n$ are independent. Therefore, $P(A\cap B_\epsilon)=P(A)P(B_\epsilon)$. Also note that
\[P(A\cap B_\epsilon)\geq P(A)-P(A\symdiff B_\epsilon)\geq P(A)-\epsilon,\qquad |P(B_\epsilon)-P(A)|\leq P(A\symdiff B_\epsilon)<\epsilon.\]
Thus, as $\epsilon\to 0$,
\begin{center}
\begin{tikzpicture}[every node/.style={outer sep=0,inner sep=0,text depth=0.3ex}, node distance=0.3em]
\node (a) at (0,1) {$=$};
\node[left=of a.west] {$P(A\cap B_\epsilon)$};
\node[right=of a.east] {$P(A)P(B_\epsilon)$};
\node[below=1cm of a] (b) {$=$};
\node[left=of b.west] {$P(A)$};
\node[right=of b.east] {$P(A)P(A)$};
\draw[-angle 90,shorten >=6pt,shorten <=6pt] ([xshift=9ex] a.south) -- ([xshift=9ex] b.north);
\draw[-angle 90,shorten >=6pt,shorten <=6pt] ([xshift=-5ex] a.south) -- ([xshift=-5ex] b.north);
\end{tikzpicture}
\end{center}
hence $P(A)=P(A)^2$, hence $P(A)=0$ or $P(A)=1$.
\end{proof}
\begin{definition}
We define the Fourier transform of a function $g$ to be
\[\widehat{g}(z)=\int_{-\infty}^\infty e^{-ixz}\,g(x)\,dx.\]
The inverse Fourier transform is
\[g(x)=\frac{1}{2\pi}\int_{-\infty}^\infty e^{ixz}\,\widehat{g}(z)\,dz.\]
\end{definition}
\begin{definition}
We say that $g$ is a Schwartz function when $g$ is $C^\infty$, and all of the derivatives $g^{(j)}(x)$ tend to 0 as $x\to\pm\infty$ faster than any power of $1/x$, i.e. $x^kg^{(j)}(x)\to 0$ as $x\to\pm\infty$ for all $j,k\geq 0$.
\end{definition}
It turns out that if $g$ is Schwartz, then $\widehat{g}$ is Schwartz.
% Csornyei relates that she once had a lecturer start talking about the Fourier transform with \pi = 1 = i because everyone divides by them in different combinations.
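A standard example (not from class): the Gaussian $g(x)=e^{-x^2/2}$ is Schwartz, and
\[\widehat{g}(z)=\int_{-\infty}^\infty e^{-ixz}e^{-x^2/2}\,dx=\sqrt{2\pi}\,e^{-z^2/2},\]
so up to the constant $\sqrt{2\pi}$ the Gaussian is its own Fourier transform. Plugging $\widehat{g}$ into the inversion formula recovers $g$:
\[\frac{1}{2\pi}\int_{-\infty}^\infty e^{ixz}\,\sqrt{2\pi}\,e^{-z^2/2}\,dz=e^{-x^2/2}=g(x).\]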