Like the flip of a fair or unfair coin. X=1 if the coin comes up heads, 0 on tails.
#Parameters
p <- .3 # the probability of a success
#Generating 10 random values
rbinom(10, size=1, prob=p)
[1] 0 1 1 0 0 1 0 1 0 0
Expected value is \[\sum_k k\cdot Pr[X=k]\]
#All possible values
k <- 0:1
#Associated probabilities
Pr <- dbinom(k, size=1, prob=p)
#expectation
sum(k*Pr)
[1] 0.3
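The variance can be checked from the pmf the same way, as \(\sum_k (k-E[X])^2\cdot Pr[X=k]\); a quick sketch using the k and Pr defined above:
#variance from the pmf
sum((k - sum(k*Pr))^2 * Pr) # equals 0.21, which is p*(1-p)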
The expected value is \(p\) and the variance is \(p(1-p)\)
bernoulli.sim <- rbinom(100, size=1, prob=p)
# Expectation
p
[1] 0.3
# from sample
mean(bernoulli.sim)
[1] 0.29
# Variance
p*(1-p)
[1] 0.21
# from sample
var(bernoulli.sim)
[1] 0.2079798
Like flipping a fair (or unfair) coin \(n\) times, counting the number of heads.
#Parameters
n <- 8 #the number of flips / attempts
p <- .4 #the probability of success
#Generate 10 random values
rbinom(10, size=n, prob=p)
[1] 2 3 5 2 3 4 4 1 3 4
#Calculate P(X=3)
dbinom(x=3, size=n, prob=p)
[1] 0.2786918
choose(n, 3)*p^3*(1-p)^(n-3)
[1] 0.2786918
#Calculate all probabilities
dbinom(x=0:8, size=n, prob=p)
[1] 0.01679616 0.08957952 0.20901888 0.27869184 0.23224320 0.12386304 0.04128768
[8] 0.00786432 0.00065536
sum(dbinom(x=0:8, size=n, prob=p))
[1] 1
#Calculate P(X <= 3)
pbinom(q=3, size=n, prob=p)
[1] 0.5940864
Visualize the binomial probability mass function:
k <- 0:8
pk <- dbinom(x=0:8, size=n, prob=p)
barplot(height=pk, names=k, main=paste0("pmf of Binom(",n,",",p,")"))
Expected value is \[\sum_k k\cdot Pr[X=k]\]
#All possible values
k <- 0:n
#Associated probabilities
Pr <- dbinom(k, size=n, prob=p)
#expectation
sum(k*Pr)
[1] 3.2
n*p
[1] 3.2
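The variance can also be computed from the pmf, as \(\sum_k (k-np)^2\cdot Pr[X=k]\); a quick check with the k and Pr above:
#variance from the pmf
sum((k - n*p)^2 * Pr) # equals 1.92, which is n*p*(1-p)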
The probability mass function
barplot(Pr, names=k, main="Probability Mass Function of Binomial(8,.4)")
The cumulative distribution function
x <- 0:8
cdfx <- pbinom(x, size=n,prob=.4)
plot(x, cdfx, type="s", main="Cumulative Distribution Function of Binomial(8,.4)", ylim=c(0,1))
The expected value is \(np\) and the variance is \(np(1-p)\)
binomial.sim <- rbinom(10000, size=n, prob=p)
# Expectation
n*p
[1] 3.2
# from sample
mean(binomial.sim)
[1] 3.2038
# Variance
n*p*(1-p)
[1] 1.92
# from sample
var(binomial.sim)
[1] 1.899255
Counting how many tails before the first head; the number of failures before the first success (independent trials, probability of success remains constant)
#Parameters
p <- .4 #The probability of success on each trial
#Generate 10 random values
rgeom(10, prob=p)
[1] 2 2 1 0 2 1 3 1 2 1
The probability mass function (plotted only out to 20, but the support is infinite!)
k <- 0:20
Pr <- dgeom(k, prob=p)
barplot(Pr, names=k, main="Probability Mass Function of Geom(.4)")
The cumulative distribution function
cdfx <- cumsum(Pr)
# if you want it to look really proper
n <- length(k)
plot(x = NA, y = NA, pch = NA,
     xlim = c(0, max(k)),
     ylim = c(0, 1),
     xlab = "k",
     ylab = "Cumulative Probability",
     main = "Cumulative Distribution Function of Geom(.4)")
points(x = k[-n], y = cdfx[-n], pch=19)
points(x = k[-1], y = cdfx[-n], pch=1)
for(i in 1:(n-1)) points(x=k[i+0:1], y=cdfx[c(i,i)], type="l")
To calculate cumulative probabilities from a geometric rv, use the pgeom function
#Pr(X <= 3)
pgeom(3, prob=p)
[1] 0.8704
To get individual probabilities at k, use the dgeom function
#Pr(X=3)
dgeom(3, prob=p)
[1] 0.0864
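As with the binomial, this matches the closed form \(Pr[X=k] = (1-p)^k\,p\):
(1-p)^3 * p # 0.0864, matching dgeom(3, prob=p)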
But it's probably faster to just use a step-type plot. (The vertical lines are not technically part of the CDF, though.)
plot(k, cdfx, type="s", main="Cumulative Distribution Function of Geom(.4)", ylim=c(0,1))
points(k, cdfx, pch = 16, col = "blue")
You can also get the cumulative probabilities directly from the pgeom function
plot(k, pgeom(k, prob=.4), type="s", main="Cumulative Distribution Function of Geom(.4)", ylim=c(0,1))
points(k, cdfx, pch = 16, col = "blue")
The expected value is \(\dfrac{1-p}{p}\) and the variance is \(\dfrac{1-p}{p^2}\)
geom.sim <- rgeom(10000, prob=p)
# Expectation
(1-p)/p
[1] 1.5
# from sample
mean(geom.sim)
[1] 1.5059
# Variance
(1-p)/(p^2)
[1] 3.75
# from sample
var(geom.sim)
[1] 3.826348
Like the number of times something occurs during a fixed time window
#Parameters
l <- 10.5 #The rate parameter, average occurrences per unit time
#It is lambda, but I'll call it "l"
#Generate 10 random values
rpois(10, lambda=l)
[1] 4 12 13 12 10 10 8 9 9 10
PMF and CDF
par(mfrow=c(1,2))
k <- 0:20
Pr <- dpois(k,l)
barplot(Pr, names=k, main=paste0("PMF of Pois(", l, ")"))
plot(k, ppois(k, l), type="s", main=paste0("CDF of Pois(", l, ")"), ylim=c(0,1))
points(k, ppois(k, l), pch = 16, col = "blue")
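Similarly, dpois matches the closed form \(Pr[X=k] = \dfrac{\lambda^k e^{-\lambda}}{k!}\); a quick check at \(k=10\) (the choice of k here is arbitrary):
dpois(10, lambda=l)
l^10 * exp(-l) / factorial(10) # the two values should agree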
The expected value and the variance are both \(\lambda\)
pois.sim <- rpois(10000, lambda=l)
# Expectation & Variance
l
[1] 10.5
# mean from sample
mean(pois.sim)
[1] 10.4465
# variance from sample
var(pois.sim)
[1] 10.47599
Like rolling a fair die
#Parameters
a <- 1 #lower bound, inclusive
b <- 6 #upper bound, inclusive
#Generate 10 random values
sample(a:b, 10, replace=TRUE) #replace=TRUE is important
[1] 4 3 5 4 5 5 3 2 3 6
The PMF and CDF
par(mfrow=c(1,2))
k <- a:b
Pr <- rep(1/(b-a+1), length(k))
barplot(Pr, names=k, main="PMF of Unif(1,6)")
plot(k, cumsum(Pr), type="s", main="CDF of Unif(1,6)", ylim=c(0,1))
points(k, cumsum(Pr), pch = 16, col = "blue")
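Following the pattern from the other distributions, the expected value is \(\frac{a+b}{2}\) and the variance is \(\frac{(b-a+1)^2-1}{12}\); a simulation sketch (the name disc.unif.sim is mine):
disc.unif.sim <- sample(a:b, 10000, replace=TRUE)
# Expectation
(a+b)/2 # 3.5
mean(disc.unif.sim)
# Variance
((b-a+1)^2 - 1)/12 # 35/12, about 2.92
var(disc.unif.sim)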
The backbone of random variable generation: for example, a random decimal between 0 and 1.
# Parameters
a <- 0 #lower bound
b <- 1 #upper bound
#generate 10 random values
runif(10, min=a, max=b)
[1] 0.2953279 0.3762149 0.9997242 0.7867803 0.8583449 0.5822392 0.2107988
[8] 0.7183277 0.8639896 0.4345879
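One reason Unif(0,1) is the backbone of random variable generation is inverse-transform sampling: if \(U \sim Unif(0,1)\), then \(F^{-1}(U)\) has CDF \(F\). A minimal sketch (the names u and x.exp are mine) that turns uniforms into exponentials via the quantile function qexp:
u <- runif(10000) # the uniform "backbone"
x.exp <- qexp(u, rate=1) # inverse CDF; equivalent to -log(1-u)
hist(x.exp, breaks=50) # should look like an Exponential(1) sample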
Probability density function and cumulative distribution function
par(mfrow=c(1,2))
x <- seq(a,b, length.out=100)
plot(x, dunif(x, a, b), type="l", main="PDF of Unif(0,1)", ylab="density", ylim=c(0,1))
plot(x, punif(x, a, b), type="l", main="CDF of Unif(0,1)", ylab="F(x)")
The expected value is \(\frac{a+b}{2}\) and the variance is \(\frac{(b-a)^2}{12}\)
unif.sim <- runif(10000, min=a, max=b)
# Expectation
(a+b)/2
[1] 0.5
# mean from sample
mean(unif.sim)
[1] 0.4993472
# Variance
(b-a)^2/12
[1] 0.08333333
# variance from sample
var(unif.sim)
[1] 0.08349469
Many quantities in the world are approximately normally distributed. The normal is useful for modeling when the distribution is symmetric and probability is densest in the middle, with decreasing tails.
#Parameters
mu <- 5 #the mean/location of the distribution
sigma2 <- 4 #the variance is in squared units!!
sigma <- sqrt(sigma2) #sigma is the standard deviation, not the variance
#generate 10 random values
rnorm(10, mean=mu, sd=sigma)
[1] 5.7379241 5.7693920 6.0246647 3.1971162 1.4964911 5.8024497 5.5295972
[8] 0.5755244 4.5957016 3.2427507
Probability density function and cumulative distribution function
par(mfrow=c(1,2))
x <- seq(mu-3*sigma, mu+3*sigma, length.out=100)
plot(x, dnorm(x, mu, sigma), type="l", main="PDF of Normal(5, 2^2)", ylab="density")
plot(x, pnorm(x, mu, sigma), type="l", main="CDF of Normal(5, 2^2)", ylab="F(x)")
Approximate the expected value numerically
\[\int_{-\infty}^\infty t f(t) dt\]
mu <- 15
sigma <- 4
t <- seq(mu-4*sigma, mu+4*sigma, length.out=1000) #a fine grid covering most of the density
d <- dnorm(t, mean=mu, sd=sigma) #density at each grid point
w <- t[2]-t[1] #width of each grid cell
head(t)
[1] -1.0000000 -0.9679680 -0.9359359 -0.9039039 -0.8718719 -0.8398398
head(d)
[1] 3.345756e-05 3.454551e-05 3.566656e-05 3.682162e-05 3.801165e-05
[6] 3.923763e-05
#Expected value
sum( t * (d*w))
[1] 14.99907
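The variance can be approximated numerically in the same way, as \(\int_{-\infty}^\infty (t-\mu)^2 f(t)\,dt\); a sketch reusing t, d, and w from above:
#Variance (numerically)
sum( (t - mu)^2 * (d*w) ) # should be close to sigma^2 = 16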
The expected value is \(\mu\) and the variance is \(\sigma^2\)
normal.sim <- rnorm(10000, mean=mu, sd=sigma)
# Expectation
mu
[1] 15
# mean from sample
mean(normal.sim)
[1] 15.01916
# Variance
sigma^2 #sigma is now 4, so the variance is 16 (the earlier sigma2 variable is stale)
[1] 16
# variance from sample
var(normal.sim)
[1] 16.03533
As a quick sanity check, estimate \(Pr(0.25 \le Z \le 0.75)\) for a standard normal by simulation and compare it to the exact value from pnorm:
normal.sim <- rnorm(1000000)
x <- .25 <= normal.sim & normal.sim <= .75
sum(x)/1000000
[1] 0.174265
pnorm(.75)-pnorm(.25)
[1] 0.1746663
Now consider sums of independent normal random variables.
# X1 ~ Normal(1,1^2)
x1 <- rnorm(10000, mean=1, sd=1)
#X2 ~ Normal(2, 2^2)
x2 <- rnorm(10000, mean=2, sd=2)
par(mfrow=c(1,2))
hist(x1)
hist(x2)
Let’s check the variances individually
var(x1)
[1] 0.9965541
var(x2)
[1] 3.944507
In theory, since X1 and X2 are independent, Var(X1+X2) = Var(X1) + Var(X2). We check with the sample variances:
var(x1+x2)
[1] 4.936043
var(x1)+var(x2)
[1] 4.941061
# the small discrepancy is the (random, near-zero) sample covariance
var(x1)+var(x2)+2*cov(x1,x2)
[1] 4.936043
The means add as well:
mean(x1+x2)
[1] 3.016663
mean(x1)+mean(x2)
[1] 3.016663
Now look at a scatterplot of the two variables and a histogram of their sum:
plot(x1,x2)
hist(x1+x2, breaks=50)
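In theory the sum is itself normal: \(X_1+X_2 \sim Normal(1+2,\ 1^2+2^2)\). A sketch overlaying that density on the histogram:
hist(x1+x2, breaks=50, freq=FALSE)
curve(dnorm(x, mean=3, sd=sqrt(5)), add=TRUE, col="red", lwd=2)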
Sums of independent uniform random variables also quickly start to look bell-shaped:
X <- runif(100000, 0, 1)
hist(X)
Y <- runif(100000, 0, 1)
hist(Y)
hist(X+Y)
Z <- runif(100000, 0, 1)
hist(X+Y+Z, breaks=100)
Let's consider a geometric that counts the number of days before an electronic component malfunctions. Suppose that every day there's a 25% chance of a malfunction.
p=.25
random.geom <- rgeom(10000, prob=p)
hist(random.geom, breaks=25)
What if more than one malfunction could happen per day? Say we replace the part immediately when it malfunctions. We could divide the day into \(N\) parts and use a geometric with success probability \(p/N\) for each part of the day. If \(X=k\), then the proportion of the day it took to break is \(k/N\). Let's start with \(N=2\) and go up from there.
par(mfrow=c(2,2))
N <- 2
geom.sim <- rgeom(10000, p/N) / N
hist(geom.sim, breaks=100)
mean(geom.sim)
[1] 3.5007
N <- 6
geom.sim <- rgeom(10000, p/N) / N
hist(geom.sim, breaks=100)
mean(geom.sim)
[1] 3.915017
N <- 24
geom.sim <- rgeom(10000, p/N) / N
hist(geom.sim, breaks=100)
mean(geom.sim)
[1] 4.059958
N <- 100
geom.sim <- rgeom(10000, p/N) / N
hist(geom.sim, breaks=100)
mean(geom.sim)
[1] 3.990827
For modeling waiting times: the length of time until an event occurs. It's a continuous version of a geometric, which you get by looking at smaller and smaller time units (e.g. days -> hours -> minutes -> seconds).
#Parameters
l <- 3 #The rate parameter, the average number of occurrences per unit time
#Generate 10 random values
rexp(10, rate=l)
[1] 0.230439147 0.089207281 0.157374512 0.096684376 0.266710576 0.045621745
[7] 0.002265614 0.807557267 0.033897664 0.595115799
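To see the "continuous version of a geometric" idea concretely, here is a sketch comparing the scaled geometric from the subdivision demo (p = .25, N = 100) to an exponential with rate .25 (the names geom.scaled and exp.target are mine):
geom.scaled <- rgeom(10000, prob=.25/100) / 100
exp.target <- rexp(10000, rate=.25)
qqplot(geom.scaled, exp.target, main="Geom(p/N)/N vs Exp(p)")
abline(0, 1) # points near this line indicate the distributions agree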
Probability density function and cumulative distribution function
par(mfrow=c(1,2))
x <- seq(0, 6/l, length.out=100)
plot(x, dexp(x, l), type="l", main="PDF of Exp(3)", ylab="density")
plot(x, pexp(x, l), type="l", main="CDF of Exp(3)", ylab="F(x)")
The expected value is \(\dfrac{1}{\lambda}\) and the variance is \(\frac{1}{\lambda^2}\)
exp.sim <- rexp(10000, l)
# Expectation
1/l
[1] 0.3333333
# mean from sample
mean(exp.sim)
[1] 0.3287313
# Variance
1/l^2
[1] 0.1111111
# variance from sample
var(exp.sim)
[1] 0.1060149
If rate = 1, what is \(Pr(X < 5)\)?
pexp(5, rate=1)
[1] 0.9932621
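This matches the closed form \(F(x) = 1 - e^{-\lambda x}\):
1 - exp(-1*5) # 0.9932621, matching pexp(5, rate=1)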
lambda = 10
simulated.exp <- rexp(10000, rate=10)
hist(simulated.exp)
head(simulated.exp)
[1] 0.22583907 0.10158912 0.15125233 0.05763004 0.22733971 0.21002218
If the waiting times between events are Exponential(rate = 10), the number of events in each unit time interval is Poisson(10). Count how many arrival times (the cumulative sums of the waiting times) fall in each unit interval:
converted.poisson <- as.vector(table(floor(cumsum(simulated.exp))))
par(mfrow=c(1,2))
hist(converted.poisson)
hist(rpois(500, lambda=10))
Here's an example of how you could combine normally distributed random variables to model daily stock prices. This is an example of a random walk.
deltas <- rnorm(30)
X <- 50 + c(0,cumsum(deltas))
plot(x=1:31, y=X, type="l")
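To get a feel for how variable such a model is, you could simulate several independent walks and plot them together; a sketch (the name paths is mine):
paths <- replicate(5, 50 + c(0, cumsum(rnorm(30)))) # 31 x 5 matrix, one walk per column
matplot(x=1:31, y=paths, type="l", lty=1, xlab="day", ylab="price")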