Section 2.3 — Inventory of discrete distributions¶
This notebook contains all the code examples from Section 2.3 Inventory of discrete distributions of the No Bullshit Guide to Statistics.
Notebook setup¶
In [1]:
Copied!
# load Python modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# load Python modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
Copied!
# Figures setup
sns.set_theme(
context="paper",
style="whitegrid",
palette="colorblind",
rc={'figure.figsize': (7,5)},
)
%config InlineBackend.figure_format = 'retina'
# Figures setup
sns.set_theme(
context="paper",
style="whitegrid",
palette="colorblind",
rc={'figure.figsize': (7,5)},
)
%config InlineBackend.figure_format = 'retina'
In [3]:
Copied!
# set random seed for repeatability
np.random.seed(42)
# set random seed for repeatability
np.random.seed(42)
In [1]:
Copied!
%pip install ministats
%pip install ministats
Requirement already satisfied: ministats in /Users/ivan/Projects/Minireference/software/ministats (0.2.0) Requirement already satisfied: matplotlib>=3.8.3 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from ministats) (3.8.3) Requirement already satisfied: numpy>=1.26.4 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from ministats) (1.26.4) Requirement already satisfied: scipy>=1.12.0 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from ministats) (1.12.0) Requirement already satisfied: pandas>=2.2.1 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from ministats) (2.2.1) Requirement already satisfied: pillow>=10.2.0 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from ministats) (10.2.0) Requirement already satisfied: seaborn>=0.13.2 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from ministats) (0.13.2) Requirement already satisfied: statsmodels>=0.14.1 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from ministats) (0.14.1) Requirement already satisfied: kiwisolver>=1.3.1 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from matplotlib>=3.8.3->ministats) (1.4.4) Requirement already satisfied: cycler>=0.10 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from matplotlib>=3.8.3->ministats) (0.11.0) Requirement already satisfied: importlib-resources>=3.2.0 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from matplotlib>=3.8.3->ministats) (5.12.0) Requirement already satisfied: pyparsing>=2.3.1 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from matplotlib>=3.8.3->ministats) (3.0.9) Requirement already satisfied: fonttools>=4.22.0 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from matplotlib>=3.8.3->ministats) (4.38.0) Requirement already satisfied: contourpy>=1.0.1 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from matplotlib>=3.8.3->ministats) (1.0.7) Requirement already satisfied: python-dateutil>=2.7 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from matplotlib>=3.8.3->ministats) (2.8.2) Requirement already satisfied: packaging>=20.0 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from matplotlib>=3.8.3->ministats) (23.0) Requirement already satisfied: pytz>=2020.1 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from pandas>=2.2.1->ministats) (2022.7.1) Requirement already satisfied: tzdata>=2022.7 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from pandas>=2.2.1->ministats) (2024.1) Requirement already satisfied: patsy>=0.5.4 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from statsmodels>=0.14.1->ministats) (0.5.6) Requirement already satisfied: zipp>=3.1.0 in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from importlib-resources>=3.2.0->matplotlib>=3.8.3->ministats) (3.13.0) Requirement already satisfied: six in /Users/ivan/Projects/Minireference/STATSbook/noBSstatsnotebooks/venv/lib/python3.9/site-packages (from patsy>=0.5.4->statsmodels>=0.14.1->ministats) (1.16.0) [notice] A new release of pip is available: 23.0 -> 24.0 [notice] To update, run: pip3.9 install --upgrade pip Note: you may need to restart the kernel to use updated packages.
In [2]:
Copied!
from ministats import plot_pmf
from ministats import plot_cdf
from ministats import plot_pmf
from ministats import plot_cdf
In [ ]:
Copied!
Definitions¶
Math prerequisites¶
Factorial¶
In [5]:
Copied!
from scipy.special import factorial
# ALT.
# from math import factorial
from scipy.special import factorial
# ALT.
# from math import factorial
In [6]:
Copied!
factorial(4)
factorial(4)
Out[6]:
24.0
In [7]:
Copied!
factorial(1), factorial(2), factorial(3)
factorial(1), factorial(2), factorial(3)
Out[7]:
(1.0, 2.0, 6.0)
In [8]:
Copied!
[factorial(k) for k in [5,6,7,8,9,10,11,12]]
[factorial(k) for k in [5,6,7,8,9,10,11,12]]
Out[8]:
[120.0, 720.0, 5040.0, 40320.0, 362880.0, 3628800.0, 39916800.0, 479001600.0]
In [9]:
Copied!
factorial(15)
factorial(15)
Out[9]:
1307674368000.0
In [10]:
Copied!
import numpy as np
np.log(factorial(15))/np.log(10)
import numpy as np
np.log(factorial(15))/np.log(10)
Out[10]:
12.116499611123398
Permutations¶
In [11]:
Copied!
from scipy.special import perm
perm(5,2)
from scipy.special import perm
perm(5,2)
Out[11]:
20.0
In [12]:
Copied!
perm(5,1), perm(5,2), perm(5,3), perm(5,4), perm(5,5)
perm(5,1), perm(5,2), perm(5,3), perm(5,4), perm(5,5)
Out[12]:
(5.0, 20.0, 60.0, 120.0, 120.0)
If you want to actually see, all the possible permutations we
use the permutations function from itertools
In [13]:
Copied!
from itertools import permutations
n = 5
nitems = range(1,n+1)
k = 2
list(permutations(nitems, k))
from itertools import permutations
n = 5
nitems = range(1,n+1)
k = 2
list(permutations(nitems, k))
Out[13]:
[(1, 2), (1, 3), (1, 4), (1, 5), (2, 1), (2, 3), (2, 4), (2, 5), (3, 1), (3, 2), (3, 4), (3, 5), (4, 1), (4, 2), (4, 3), (4, 5), (5, 1), (5, 2), (5, 3), (5, 4)]
In [ ]:
Copied!
Combinations¶
In [14]:
Copied!
from scipy.special import comb
comb(5,2)
from scipy.special import comb
comb(5,2)
Out[14]:
10.0
In [15]:
Copied!
from itertools import combinations
n = 5
k = 2
list(combinations(range(1,n+1), k))
from itertools import combinations
n = 5
k = 2
list(combinations(range(1,n+1), k))
Out[15]:
[(1, 2), (1, 3), (1, 4), (1, 5), (2, 3), (2, 4), (2, 5), (3, 4), (3, 5), (4, 5)]
In [ ]:
Copied!
In [ ]:
Copied!
In [ ]:
Copied!
Summations using basic Python¶
In [16]:
Copied!
N = 10
nums = range(1,N+1)
list(nums)
N = 10
nums = range(1,N+1)
list(nums)
Out[16]:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
In [17]:
Copied!
sum(nums)
sum(nums)
Out[17]:
55
In [18]:
Copied!
N*(N+1)/2 # formula
N*(N+1)/2 # formula
Out[18]:
55.0
In [19]:
Copied!
squarednums = [num**2 for num in nums]
squarednums
squarednums = [num**2 for num in nums]
squarednums
Out[19]:
[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
In [20]:
Copied!
sum(squarednums)
sum(squarednums)
Out[20]:
385
In [21]:
Copied!
N*(N+1)*(2*N+1)/6 # formula
N*(N+1)*(2*N+1)/6 # formula
Out[21]:
385.0
In [ ]:
Copied!
In [22]:
Copied!
cubednums = [num**3 for num in nums]
cubednums
cubednums = [num**3 for num in nums]
cubednums
Out[22]:
[1, 8, 27, 64, 125, 216, 343, 512, 729, 1000]
In [23]:
Copied!
sum(cubednums)
sum(cubednums)
Out[23]:
3025
In [24]:
Copied!
( N*(N+1)/2 )**2 # formula
( N*(N+1)/2 )**2 # formula
Out[24]:
3025.0
In [ ]:
Copied!
Sum of the geometric series¶
In [25]:
Copied!
a = 0.5
r = 0.5
sum([a*r**k for k in range(0,60)])
a = 0.5
r = 0.5
sum([a*r**k for k in range(0,60)])
Out[25]:
1.0
In [ ]:
Copied!
Summations using SymPy (bonus material)¶
In [26]:
Copied!
from sympy import symbols
from sympy import summation
from sympy import simplify
k, N, r = symbols("k N r")
from sympy import symbols
from sympy import summation
from sympy import simplify
k, N, r = symbols("k N r")
In [ ]:
Copied!
Sum of arithmetic sequence¶
In [27]:
Copied!
a_k = k
summation(a_k, (k,0, N))
a_k = k
summation(a_k, (k,0, N))
Out[27]:
$\displaystyle \frac{N^{2}}{2} + \frac{N}{2}$
In [28]:
Copied!
simplify(summation(a_k, (k,0, N)))
simplify(summation(a_k, (k,0, N)))
Out[28]:
$\displaystyle \frac{N \left(N + 1\right)}{2}$
In [ ]:
Copied!
Sum of squares¶
In [29]:
Copied!
b_k = k**2
simplify(summation(b_k, (k,0, N)))
b_k = k**2
simplify(summation(b_k, (k,0, N)))
Out[29]:
$\displaystyle \frac{N \left(2 N^{2} + 3 N + 1\right)}{6}$
In [ ]:
Copied!
Sum of cubes¶
In [30]:
Copied!
c_k = k**3
simplify(summation(c_k, (k,0, N)))
c_k = k**3
simplify(summation(c_k, (k,0, N)))
Out[30]:
$\displaystyle \frac{N^{2} \left(N^{2} + 2 N + 1\right)}{4}$
In [ ]:
Copied!
Geometric series¶
In [31]:
Copied!
g_k = r**k
summation(g_k, (k,0, N))
g_k = r**k
summation(g_k, (k,0, N))
Out[31]:
$\displaystyle \begin{cases} N + 1 & \text{for}\: r = 1 \\\frac{1 - r^{N + 1}}{1 - r} & \text{otherwise} \end{cases}$
In [32]:
Copied!
# from sympy import limit, oo
# limit( (1-r**(N+1))/(1-r), N, oo)
# # doesn't work; need to specify assumption r < 1
# from sympy import limit, oo
# limit( (1-r**(N+1))/(1-r), N, oo)
# # doesn't work; need to specify assumption r < 1
In [ ]:
Copied!
In [ ]:
Copied!
In [ ]:
Copied!
In [ ]:
Copied!
In [ ]:
Copied!
Discrete distributions reference¶
Discrete uniform¶
In [33]:
Copied!
# import the model family
from scipy.stats import randint
# choose parameters
alpha = 1 # start at
beta = 4 # stop at
# create the rv object
rvU = randint(alpha, beta+1)
# use one of the rv object's methods
# import the model family
from scipy.stats import randint
# choose parameters
alpha = 1 # start at
beta = 4 # stop at
# create the rv object
rvU = randint(alpha, beta+1)
# use one of the rv object's methods
The limits of the sample space of the random variable rvU
can be obtained by calling its .support()
method.
In [34]:
Copied!
rvU.support()
rvU.support()
Out[34]:
(1, 4)
In [35]:
Copied!
rvU.mean()
rvU.mean()
Out[35]:
2.5
In [36]:
Copied!
rvU.var()
rvU.var()
Out[36]:
1.25
In [37]:
Copied!
rvU.std() # = np.sqrt(rvU.var())
rvU.std() # = np.sqrt(rvU.var())
Out[37]:
1.118033988749895
In [ ]:
Copied!
Probability mass function¶
In [38]:
Copied!
for x in range(1,4+1):
print("f_U(",x,") = ", rvU.pmf(x))
for x in range(1,4+1):
print("f_U(",x,") = ", rvU.pmf(x))
f_U( 1 ) = 0.25 f_U( 2 ) = 0.25 f_U( 3 ) = 0.25 f_U( 4 ) = 0.25
To create a stem-plot of the probability mass function $f_U$, we can use the following three-step procedure:
- Create a range of inputs
xs
for the plot. - Compute the value of $f_U =$
rvU
for each of the inputs and store the results as list of valuesfUs
. - Plot the values
fUs
by calling the functionplt.stem(xs,fUs)
.
In [39]:
Copied!
import numpy as np
import matplotlib.pyplot as plt
xs = np.arange(0,8+1)
fUs = rvU.pmf(xs)
plt.stem(xs, fUs, basefmt=" ")
import numpy as np
import matplotlib.pyplot as plt
xs = np.arange(0,8+1)
fUs = rvU.pmf(xs)
plt.stem(xs, fUs, basefmt=" ")
Out[39]:
<StemContainer object of 3 artists>
In [40]:
Copied!
# ALT
plot_pmf(rvU, xlims=[0,8+1])
# ALT
plot_pmf(rvU, xlims=[0,8+1])
Out[40]:
<AxesSubplot:xlabel='x', ylabel='$f_{X}$'>
In [ ]:
Copied!
Cumulative distribution function¶
In [41]:
Copied!
for b in range(1,4+1):
print("F_U(", b, ") = ", rvU.cdf(b))
for b in range(1,4+1):
print("F_U(", b, ") = ", rvU.cdf(b))
F_U( 1 ) = 0.25 F_U( 2 ) = 0.5 F_U( 3 ) = 0.75 F_U( 4 ) = 1.0
In [42]:
Copied!
import numpy as np
import seaborn as sns
xs = np.linspace(0,8,1000)
FUs = rvU.cdf(xs)
sns.lineplot(x=xs, y=FUs)
import numpy as np
import seaborn as sns
xs = np.linspace(0,8,1000)
FUs = rvU.cdf(xs)
sns.lineplot(x=xs, y=FUs)
Out[42]:
<AxesSubplot:>
In [43]:
Copied!
# ALT
plot_cdf(rvU, xlims=[0,8], rv_name="U")
# ALT
plot_cdf(rvU, xlims=[0,8], rv_name="U")
Out[43]:
<AxesSubplot:xlabel='u', ylabel='$F_{U}$'>
Let's generate 10 random observations from random variable rvU
:
In [44]:
Copied!
rvU.rvs(10)
rvU.rvs(10)
Out[44]:
array([3, 4, 1, 3, 3, 4, 1, 1, 3, 2])
In [ ]:
Copied!
In [ ]:
Copied!
Bernoulli¶
In [45]:
Copied!
from scipy.stats import bernoulli
rvB = bernoulli(p=0.3)
from scipy.stats import bernoulli
rvB = bernoulli(p=0.3)
In [46]:
Copied!
rvB.support()
rvB.support()
Out[46]:
(0, 1)
In [47]:
Copied!
rvB.mean(), rvB.var()
rvB.mean(), rvB.var()
Out[47]:
(0.3, 0.21)
In [48]:
Copied!
rvB.rvs(10)
rvB.rvs(10)
Out[48]:
array([0, 0, 1, 0, 1, 0, 1, 1, 0, 0])
In [49]:
Copied!
plot_pmf(rvB, xlims=[0,5]);
plot_pmf(rvB, xlims=[0,5]);
In [ ]:
Copied!
In [ ]:
Copied!
Poisson¶
In [50]:
Copied!
from scipy.stats import poisson
lam = 10
rvP = poisson(lam)
from scipy.stats import poisson
lam = 10
rvP = poisson(lam)
In [51]:
Copied!
rvP.pmf(8)
rvP.pmf(8)
Out[51]:
0.11259903214902009
In [52]:
Copied!
rvP.cdf(8)
rvP.cdf(8)
Out[52]:
0.3328196787507191
In [53]:
Copied!
## ALT. way to compute the value F_P(8) =
# sum([rvP.pmf(x) for x in range(0,8+1)])
## ALT. way to compute the value F_P(8) =
# sum([rvP.pmf(x) for x in range(0,8+1)])
In [54]:
Copied!
plot_pmf(rvP, xlims=[0,30]);
plot_pmf(rvP, xlims=[0,30]);
In [ ]:
Copied!
In [ ]:
Copied!
Binomial¶
We'll use the name rvX
because rvB
was already used for the Bernoulli random variable above.
In [55]:
Copied!
from scipy.stats import binom
n = 20
p = 0.14
rvX = binom(n,p)
from scipy.stats import binom
n = 20
p = 0.14
rvX = binom(n,p)
In [56]:
Copied!
rvX.support()
rvX.support()
Out[56]:
(0, 20)
In [57]:
Copied!
rvX.mean(), rvX.var()
rvX.mean(), rvX.var()
Out[57]:
(2.8000000000000003, 2.4080000000000004)
In [58]:
Copied!
plot_pmf(rvX, xlims=[0,30]);
plot_pmf(rvX, xlims=[0,30]);
In [ ]:
Copied!
In [ ]:
Copied!
Geometric¶
In [59]:
Copied!
from scipy.stats import geom
rvG = geom(p = 0.2)
from scipy.stats import geom
rvG = geom(p = 0.2)
In [60]:
Copied!
rvG.support()
rvG.support()
Out[60]:
(1, inf)
In [61]:
Copied!
rvG.mean(), rvG.var()
rvG.mean(), rvG.var()
Out[61]:
(5.0, 20.0)
In [62]:
Copied!
plot_pmf(rvG, xlims=[0,40]);
plot_pmf(rvG, xlims=[0,40]);
In [ ]:
Copied!
In [ ]:
Copied!
Negative binomial¶
In [63]:
Copied!
from scipy.stats import nbinom
r = 10
p = 0.5
rvN = nbinom(r,p)
from scipy.stats import nbinom
r = 10
p = 0.5
rvN = nbinom(r,p)
In [64]:
Copied!
rvN.support()
rvN.support()
Out[64]:
(0, inf)
In [65]:
Copied!
rvN.mean(), rvN.var()
rvN.mean(), rvN.var()
Out[65]:
(10.0, 20.0)
In [66]:
Copied!
plot_pmf(rvN, xlims=[0,40]);
plot_pmf(rvN, xlims=[0,40]);
In [ ]:
Copied!
In [ ]:
Copied!
Hypergeometric¶
In [67]:
Copied!
from scipy.stats import hypergeom
a = 30 # number of success balls
b = 40 # number of failure balls
n = 20 # how many we're drawing
rvH = hypergeom(a+b, a, n)
from scipy.stats import hypergeom
a = 30 # number of success balls
b = 40 # number of failure balls
n = 20 # how many we're drawing
rvH = hypergeom(a+b, a, n)
In [68]:
Copied!
rvH.support()
rvH.support()
Out[68]:
(0, 20)
In [69]:
Copied!
rvH.mean(), rvH.var()
rvH.mean(), rvH.var()
Out[69]:
(8.571428571428571, 3.54924578527063)
In [70]:
Copied!
meanH, stdH = rvH.stats()
print("mean =", meanH, " std =", stdH)
meanH, stdH = rvH.stats()
print("mean =", meanH, " std =", stdH)
mean = 8.571428571428571 std = 3.54924578527063
In [71]:
Copied!
plot_pmf(rvH, xlims=[0,30]);
plot_pmf(rvH, xlims=[0,30]);
In [ ]:
Copied!
Tomatoes salad probabilities¶
In [72]:
Copied!
a = 3 # number of good tomatoes
b = 4 # number of rotten tomatoes
n = 2 # how many we're drawing
rvHe = hypergeom(a+b, a, n)
plot_pmf(rvHe, xlims=[0,3])
rvHe.pmf(0), rvHe.pmf(1), rvHe.pmf(2)
a = 3 # number of good tomatoes
b = 4 # number of rotten tomatoes
n = 2 # how many we're drawing
rvHe = hypergeom(a+b, a, n)
plot_pmf(rvHe, xlims=[0,3])
rvHe.pmf(0), rvHe.pmf(1), rvHe.pmf(2)
Out[72]:
(0.28571428571428575, 0.5714285714285715, 0.14285714285714288)
Number of dogs seen by Amy¶
In [73]:
Copied!
a = 7 # number dogs
b = 20 - 7 # number of other animals
n = 12 # how many "patients" Amy will see today
rvD = hypergeom(a+b, a, n)
a = 7 # number dogs
b = 20 - 7 # number of other animals
n = 12 # how many "patients" Amy will see today
rvD = hypergeom(a+b, a, n)
In [74]:
Copied!
# Pr of exactly five dogs
rvD.pmf(5)
# Pr of exactly five dogs
rvD.pmf(5)
Out[74]:
0.2860681114551084
In [75]:
Copied!
plot_pmf(rvD, xlims=[0,10]);
plot_pmf(rvD, xlims=[0,10]);
In [ ]:
Copied!
In [76]:
Copied!
from scipy.stats import multinomial
n = 10
ps = [0.1, 0.5, 0.8]
rvM = multinomial(n,ps)
from scipy.stats import multinomial
n = 10
ps = [0.1, 0.5, 0.8]
rvM = multinomial(n,ps)
In [77]:
Copied!
rvM.rvs()
rvM.rvs()
Out[77]:
array([[0, 6, 4]])
In [78]:
Copied!
# TODO: 3D scatter plot of points in space
# TODO: 3D scatter plot of points in space
In [ ]:
Copied!
Modelling real-world data using probability¶
TODO: add simple inference and plots
In [ ]:
Copied!
Discussion¶
In [ ]:
Copied!
In [ ]:
Copied!