\classheader{2012-11-27} Today we'll be starting probability. We'll mainly discuss some terminology and notation.
\begin{definition} We say that $(\Omega,\cal{F},P)$ is a probability space when it is a measure space ($\Omega$ being the underlying set, $\cal{F}$ the $\sigma$-algebra, and $P$ the measure) such that $P(\Omega)=1$. We say that $\Omega$ is the set of outcomes, $\cal{F}$ is the $\sigma$-algebra of events, and $P$ is the probability measure. \end{definition}
\begin{examples} $\text{}$ \begin{itemize} \item We can make a probability space representing flipping a coin $n$ times. We let $\Omega=\{H,T\}^n$, $\cal{F}=$ all subsets of $\Omega$, and $P(\omega)=\frac{1}{2^n}$ for each $\omega\in\Omega$. \item Now suppose we are flipping a coin infinitely many times. Then $\Omega$ consists of all infinite sequences of $H$ and $T$, and $\cal{F}$ is the $\sigma$-algebra generated by the cylinder sets, e.g. \[\{(x_1,x_2,\ldots)\in\Omega\mid x_1=H,x_2=T,\ldots,x_n=H\}.\] Finally, we define $P$ to be the product measure $(\frac{1}{2}\delta_H+\frac{1}{2}\delta_T)^\N$, or in other words the measure such that $P(C)=\frac{1}{2^n}$ when $C$ is a cylinder set in which the first $n$ coordinates have been fixed. \end{itemize} \end{examples}
\begin{definition} A random variable $X$ is a function $X:\Omega\to(-\infty,\infty)$ such that $X^{-1}(B)\in\cal{F}$ for every Borel $B\subseteq\R$. We define \[\mu_X(B)\overset{\text{def}}{=}P(X\in B)=P(X^{-1}(B)),\] which specifies a measure on $\R$. We say that this is the distribution of $X$. The distribution function of $X$ is the function defined by $F(x)=\mu_X((-\infty,x])=P(X\leq x)$. \end{definition}
Here are some basic properties of the distribution function. \begin{itemize} \item $\lim\limits_{x\to\infty}F(x)=1$. \item $\lim\limits_{x\to-\infty}F(x)=0$. \item $F$ is monotone increasing. \item $F$ is right continuous, but not necessarily left continuous: \[\lim_{\epsilon\to 0^+}F(x+\epsilon)=F(x),\quad\text{but possibly}\quad\lim_{\epsilon\to 0^-}F(x+\epsilon)\neq F(x).\] \end{itemize} It turns out that these properties actually characterize the functions $F$ which arise as distribution functions: given such an $F$, we can define \[\mu_X((-\infty,x])\overset{\text{def}}{=}F(x)\] and then extend $\mu_X$ to all Borel sets.
\begin{definition} If there is a function $f:\R\to[0,\infty)$ such that $P(a\leq X\leq b)=\int_a^b f$, we say that $f$ is the density of $X$. If it exists, we then have that $\int_{-\infty}^\infty f=1$, and if $f$ is continuous at $x$ then $f(x)=F'(x)$. \end{definition}
\begin{definition} The characteristic function of an event $E\in\cal{F}$, i.e. the function \[\chi_E(\omega)=\begin{cases} 1 & \text{ if }\omega\in E,\\ 0 & \text{ if }\omega\notin E, \end{cases}\] is called an indicator function when we are doing probability. \end{definition}
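For a concrete illustration of densities, distribution functions, and indicator functions, here is a standard example: a uniformly distributed random variable.
\begin{example} Let $X$ be uniformly distributed on $[0,1]$, i.e.\ let $\mu_X$ be Lebesgue measure restricted to $[0,1]$. Then $X$ has density $f=\chi_{[0,1]}$, and its distribution function \[F(x)=\begin{cases} 0 & \text{ if }x<0,\\ x & \text{ if }0\leq x\leq 1,\\ 1 & \text{ if }x>1, \end{cases}\] satisfies all of the properties listed above; in this case $F$ is even continuous, not just right continuous. \end{example}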
\begin{examples} $\text{}$ \begin{itemize} \item Returning to flipping coins, we let $(\Omega,\cal{F},P)$ represent flipping a coin infinitely many times. Define \[X_n(\omega_1,\omega_2,\ldots)=\omega_n=\begin{cases} 1&\text{ if the }n\text{th flip is heads},\\ 0 & \text{ if the }n\text{th flip is tails,} \end{cases}\] and let $S_n=X_1+\cdots+X_n$. Then $S_n$ is the number of heads in the first $n$ flips. Define $\cal{F}_n$ to be the $\sigma$-algebra of events that depend only on the first $n$ flips. Then $S_n$ is also a random variable on $(\Omega,\cal{F}_n,P)$, but $S_{n+1}$ is not. \item Let $\mu$ be a probability measure on $(\R,\text{Borel})$. Consider the random variable $X:\R\to\R$ which is the identity function on $\R$. Then $\mu_X=\mu$.
\item We say that $X$ has the normal distribution with mean $\mu$ and variance $\sigma^2$ if $X$ has density \[f(x)=\frac{1}{\sqrt{2\pi\sigma^2}}\;e^{-(x-\mu)^2/2\sigma^2}.\] In particular, the standard normal distribution ($\mu=0,\sigma=1$) has density $f(x)=\frac{1}{\sqrt{2\pi}}\;e^{-x^2/2}$, so that its distribution function is \[\Phi(x)=\int_{-\infty}^x\frac{1}{\sqrt{2\pi}}\;e^{-t^2/2}\,dt.\] \item If $X$ is a random variable and $g:\R\to\R$ is a Borel-measurable function, then $Y=g(X)$ is also a random variable. \item The Cantor function\vspace{0.1in} \begin{center} \begin{tikzpicture}[scale=2]
\draw[thick] (-1,0) to node[midway,label={\tiny $0$}] {} (0,0);
\draw[thick] (1,1) to node[midway,label={\tiny $1$}] {} (2,1);
\draw[thick] ($(1/3,0.5)$) node (a) {} to node[midway,label={\tiny $\frac{1}{2}$}] {} ($(2/3,0.5)$) node (b) {};
\draw[thick] ($(1/9,0.25)$) node (c) {} to node[midway,label={\tiny $\frac{1}{4}$}] {} ($(2/9,0.25)$) node (d) {};
\draw[thick] ($(7/9,0.75)$) node (e) {} to node[midway,label={\tiny $\frac{3}{4}$}] {} ($(8/9,0.75)$) node (f) {};
\draw[thin] (-1,0) to (2,0);
\draw[thin,dotted] (a.center) --++ (0,-0.5) node [label={[shift={(0,-0.7)}]\tiny $\frac{1}{3}$}] {};
\draw[thin,dotted] (b.center) --++ (0,-0.5) node [label={[shift={(0,-0.7)}]\tiny $\frac{2}{3}$}] {};
\draw[thin,dotted] (c.center) --++ (0,-0.25) node [label={[shift={(0,-0.7)}]\tiny $\frac{1}{9}$}] {};
\draw[thin,dotted] (d.center) --++ (0,-0.25) node [label={[shift={(0,-0.7)}]\tiny $\frac{2}{9}$}] {};
\draw[thin,dotted] (e.center) --++ (0,-0.75) node [label={[shift={(0,-0.7)}]\tiny $\frac{7}{9}$}] {};
\draw[thin,dotted] (f.center) --++ (0,-0.75) node [label={[shift={(0,-0.7)}]\tiny $\frac{8}{9}$}] {};
\draw[thin,dotted] (1,1) to (1,0) node [label={[shift={(0,-0.63)}]\tiny $1$}] {};
\node [label={[shift={(0,-0.63)}]\tiny $0$}] at (0,0) {};
\end{tikzpicture} \end{center} is a continuous function from $\R$ to $[0,1]$, and it is the distribution function of some random variable (it meets all of the criteria we set above). It has no atoms (i.e., no points of positive measure), and no density function. Its distribution is also not absolutely continuous with respect to Lebesgue measure, because the Cantor function maps the Cantor set (which has Lebesgue measure 0) onto $[0,1]$ (which has positive Lebesgue measure), so the distribution assigns measure $1$ to a Lebesgue-null set. \end{itemize} \end{examples}
\begin{definition} The expected value $E(X)$ of a random variable $X$ is defined to be \[E(X)=\int X\,dP=\int x\,d\mu_X.\] However, $E(X)$ may fail to exist: the integral is undefined when the positive and negative parts of $X$ both have infinite integral. \end{definition}
It is easy to see that for any Borel-measurable $g$, we have \[E(g(X))=\int g(x)\,d\mu_X,\] and that if $X$ has density function $f$, we have \[E(g(X))=\int f(x)g(x)\,dx.\]
\begin{definition} The variance $\text{Var}(X)$ of a random variable $X$ is defined to be \begin{align*} \text{Var}(X) &= E((X-E(X))^2)\\ &=E(X^2-2X\cdot E(X)+E(X)^2)\\ &=E(X^2)-2E(X)E(X)+E(X)^2\\ &=E(X^2)-E(X)^2. \end{align*} The variance is often denoted by $\sigma^2$, so that $\sigma=\sqrt{\text{Var}(X)}$ (you can see that $\text{Var}(X)$ is non-negative via Cauchy-Schwarz, Jensen's inequality, or simply by noting that $\text{Var}(X)$ is the expected value of something non-negative). \end{definition}
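As a quick check of this formula, we can compute the variance of a single coin flip, using the indicator $X_n$ defined earlier.
\begin{example} For the coin-flip indicator $X_n$ above, we have $P(X_n=1)=P(X_n=0)=\frac{1}{2}$, and $X_n^2=X_n$ since $X_n$ only takes the values $0$ and $1$. Hence \[E(X_n)=\tfrac{1}{2},\qquad E(X_n^2)=\tfrac{1}{2},\qquad \text{Var}(X_n)=E(X_n^2)-E(X_n)^2=\tfrac{1}{2}-\tfrac{1}{4}=\tfrac{1}{4}.\] \end{example}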
\begin{theorem}[Markov's inequality] For any random variable $X$ and any $c>0$, we have \[P(|X|\geq c)\leq\frac{E(|X|)}{c}.\] \end{theorem} \begin{proof} Define a new random variable $X_c$ by $X_c=c\chi_{|X|\geq c}$. It is easy to see that $X_c\leq|X|$. Now note that \[c\cdot P(|X|\geq c)=c\cdot P(X_c=c)=E(X_c)\leq E(|X|).\qedhere\] \end{proof}
\begin{theorem}[Chebyshev's inequality] For any random variable $X$ and any $c>0$, we have \[P(|X-E(X)|\geq c)\leq\frac{\text{Var}(X)}{c^2}.\] \end{theorem} \begin{proof} Applying Markov's inequality to the random variable $|X-E(X)|^2$ and the constant $c^2$, we immediately get that \[P(|X-E(X)|\geq c)=P(|X-E(X)|^2\geq c^2)\leq\frac{E(|X-E(X)|^2)}{c^2}=\frac{\text{Var}(X)}{c^2}.\qedhere\] \end{proof}
\begin{homework} Show that for any Borel-measurable, non-decreasing $f:[0,\infty)\to[0,\infty)$ and any non-negative random variable $X$, we have, for all $c$ with $f(c)>0$, \[P(X\geq c)\leq\frac{E(f(X))}{f(c)}.\] \end{homework}
\begin{definition} We say that events $A,B\in\cal{F}$ are independent if $P(A\cap B)=P(A)\cdot P(B)$. More generally, the collection of events $\{A_\alpha\}_{\alpha\in I}$ is independent if \[P(A_{\alpha_1}\cap\cdots\cap A_{\alpha_n})=P(A_{\alpha_1})\cdots P(A_{\alpha_n})\] for all finite subsets $\{\alpha_1,\ldots,\alpha_n\}\subseteq I$. Note that this is \textbf{not} the same as the $A_\alpha$'s being pairwise independent. \end{definition}
\begin{example} Suppose we are rolling a die twice. Let \begin{align*} A_1&=\{\text{sum of the rolls is }7\},\\ A_2&=\{\text{first roll is }1\},\\ A_3&=\{\text{second roll is }6\}. \end{align*} It is easy to see that \[P(A_1)=P(A_2)=P(A_3)=\frac{1}{6}.\] We have \[P(A_1\cap A_2)=P(A_1\cap A_3)=P(A_2\cap A_3)=\frac{1}{36},\] so the events $A_1,A_2,A_3$ are pairwise independent, but \[P(A_1\cap A_2\cap A_3)=\frac{1}{36}\neq\frac{1}{6}\cdot\frac{1}{6}\cdot\frac{1}{6}=\frac{1}{216},\] so the collection is not independent. \end{example}
\begin{definition} If $\cal{F}_\alpha$ for $\alpha\in I$ are $\sigma$-algebras, we say that the $\cal{F}_\alpha$'s are independent if \[P(A_{\alpha_1}\cap\cdots\cap A_{\alpha_n})=P(A_{\alpha_1})\cdots P(A_{\alpha_n})\] for all distinct $\alpha_1,\ldots,\alpha_n\in I$ and all $A_{\alpha_1}\in\cal{F}_{\alpha_1},\ldots,A_{\alpha_n}\in\cal{F}_{\alpha_n}$. \end{definition}
To any random variable $X$, there corresponds a $\sigma$-algebra $\cal{F}_X=\{X^{-1}(B)\mid \text{Borel }B\subseteq\R\}$, which is the smallest $\sigma$-algebra with respect to which $X$ is measurable.
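For instance, the $\sigma$-algebra generated by an indicator random variable is easy to describe, and it ties this notion back to independence of events.
\begin{example} If $X=\chi_E$ is the indicator of an event $E\in\cal{F}$, then $\cal{F}_X=\{\emptyset,E,E^c,\Omega\}$, the smallest $\sigma$-algebra containing $E$. In the dice example above, the $\sigma$-algebras $\cal{F}_{\chi_{A_2}}$ and $\cal{F}_{\chi_{A_3}}$ are independent (independence of $A_2$ and $A_3$ forces independence of their complements as well), while the three $\sigma$-algebras $\cal{F}_{\chi_{A_1}},\cal{F}_{\chi_{A_2}},\cal{F}_{\chi_{A_3}}$ are not independent. \end{example}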