import math as m
import altair as alt
import numpy as np
import pandas as pd
Probability distributions in Python ‘from scratch’
fastai
Probability distributions in Python ‘from scratch’
Here I write up some functions to compute the PDF and CDF of the uniform and normal probability distributions, based on the book Data Science from Scratch.
Libraries
# Turn off Altair's limit on the number of data rows a chart may embed.
alt.data_transformers.disable_max_rows()
# Out: DataTransformerRegistry.enable('default')
Uniform distribution
def uniform_pdf(x: float) -> float:
    """Return the density of the standard uniform distribution at ``x``.

    The density is 1 on the half-open interval [0, 1) and 0 everywhere else.
    """
    return 1 if 0 <= x < 1 else 0


def uniform_cdf(x: float) -> float:
    """Return the probability that a standard uniform variable is <= ``x``.

    The result is 0 below the interval, ``x`` itself inside [0, 1), and 1
    from the upper edge onwards.
    """
    if x < 0:
        return 0
    if x < 1:
        return x
    return 1


# Example values
# Print a small tab-separated table of PDF and CDF values at points below,
# inside, at the upper edge of, and above the interval [0, 1).
print("x\tpdf\tcdf\n")
for x in [-2, 0, .2, .8, 1, 1.5]:
    print(f"{x}\t{uniform_pdf(x)}\t{uniform_cdf(x)}")
# Output:
# x pdf cdf
-2 0 0
0 1 0
0.2 1 0.2
0.8 1 0.8
1 0 1
1.5 0 1
For plotting we generate both cdf and pdf values in a tidy format.
# Evaluate the uniform PDF and CDF on 1000 evenly spaced points in [-1, 2],
# then reshape to long ("tidy") form: one row per (x, variable, value).
x = pd.Series(np.linspace(-1, 2, 1000))
uniform = pd.melt(
    pd.DataFrame(
        {
            'x': x,
            'pdf': x.apply(uniform_pdf),
            'cdf': x.apply(uniform_cdf),
        }
    ),
    id_vars='x',
)
uniform
| | x | variable | value |
|---|---|---|---|
| 0 | -1.000000 | pdf | 0.0 |
| 1 | -0.996997 | pdf | 0.0 |
| 2 | -0.993994 | pdf | 0.0 |
| 3 | -0.990991 | pdf | 0.0 |
| 4 | -0.987988 | pdf | 0.0 |
| ... | ... | ... | ... |
| 1995 | 1.987988 | cdf | 1.0 |
| 1996 | 1.990991 | cdf | 1.0 |
| 1997 | 1.993994 | cdf | 1.0 |
| 1998 | 1.996997 | cdf | 1.0 |
| 1999 | 2.000000 | cdf | 1.0 |
2000 rows × 3 columns
uniform.groupby('variable').describe().loc[:, ('value', slice(None))].T
| | variable | cdf | pdf |
|---|---|---|---|
| value | count | 1000.000000 | 1000.000000 |
| | mean | 0.500000 | 0.333000 |
| | std | 0.441243 | 0.471522 |
| | min | 0.000000 | 0.000000 |
| | 25% | 0.000000 | 0.000000 |
| | 50% | 0.500000 | 0.000000 |
| | 75% | 1.000000 | 1.000000 |
| | max | 1.000000 | 1.000000 |
# Interactive chart of the uniform PDF and CDF. Hovering selects the nearest
# x value and reveals a marker, a vertical rule and the numeric value there.

# Base line chart: one coloured line per entry in the `variable` column.
chart = alt.Chart().mark_line().encode(
    alt.X('x:Q'), alt.Y('value:Q'), alt.Color('variable:N'),
)
# Mouse-driven selection keyed on the x encoding.
label = alt.selection_single(
    encodings=['x'], on='mouseover', nearest=True, empty='none'
)
alt.layer(
    # The PDF and CDF lines themselves.
    chart,
    # Points with opacity 1 when selected and 0 otherwise; this layer also
    # carries the selection.
    chart.mark_circle().encode(
        opacity=alt.condition(label, alt.value(1), alt.value(0))
    ).add_selection(label),
    # Vertical guide, filtered down to the selected x.
    alt.Chart().mark_rule(color='darkgray').encode(
        alt.X('x:Q')
    ).transform_filter(label),
    # Value labels (four decimals), filtered down to the selected x.
    chart.mark_text(align='left', dx=5, dy=-5, strokeWidth=0.5).encode(
        text=alt.Text('value:Q', format=',.4f')
    ).transform_filter(label),
    data=uniform
).properties(width=600, title='Uniform PDF and CDF')

# Normal PDF
def calc_normal_pdf(x: float, mu: float = 0, sigma: float = 1) -> float:
    """Return the normal (Gaussian) density at ``x``.

    Args:
        x: Point at which to evaluate the density.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution. It is not validated;
            ``sigma == 0`` raises ``ZeroDivisionError``.

    Returns:
        ``exp(-(x - mu)**2 / (2 * sigma**2)) / (sqrt(2 * pi) * sigma)``.
    """
    return m.exp(-(x - mu) ** 2 / (2 * sigma ** 2)) / (m.sqrt(2 * m.pi) * sigma)


# Grid of 1000 evenly spaced points in [-5, 5] used to evaluate the normal
# distribution functions.
x = pd.Series(np.linspace(-5, 5, 1000))
# Evaluate the normal density on the grid `x` for sigma = 1, 2 and 3 (mu = 0
# throughout), then reshape to long form: one row per (x, variable, value).
normal_pdf = pd.melt(
    pd.DataFrame(
        {
            'x': x,
            'mu=0, sigma=1': x.apply(lambda v: calc_normal_pdf(v, 0, 1)),
            'mu=0, sigma=2': x.apply(lambda v: calc_normal_pdf(v, 0, 2)),
            'mu=0, sigma=3': x.apply(lambda v: calc_normal_pdf(v, 0, 3)),
        }
    ),
    id_vars='x',
)
normal_pdf
| | x | variable | value |
|---|---|---|---|
| 0 | -5.00000 | mu=0, sigma=1 | 0.000001 |
| 1 | -4.98999 | mu=0, sigma=1 | 0.000002 |
| 2 | -4.97998 | mu=0, sigma=1 | 0.000002 |
| 3 | -4.96997 | mu=0, sigma=1 | 0.000002 |
| 4 | -4.95996 | mu=0, sigma=1 | 0.000002 |
| ... | ... | ... | ... |
| 2995 | 4.95996 | mu=0, sigma=3 | 0.033902 |
| 2996 | 4.96997 | mu=0, sigma=3 | 0.033715 |
| 2997 | 4.97998 | mu=0, sigma=3 | 0.033529 |
| 2998 | 4.98999 | mu=0, sigma=3 | 0.033344 |
| 2999 | 5.00000 | mu=0, sigma=3 | 0.033159 |
3000 rows × 3 columns
normal_pdf.groupby('variable').describe()['value'].T
| variable | mu=0, sigma=1 | mu=0, sigma=2 | mu=0, sigma=3 |
|---|---|---|---|
| count | 1000.000000 | 1000.000000 | 1000.000000 |
| mean | 0.099900 | 0.098668 | 0.090385 |
| std | 0.134980 | 0.065984 | 0.032457 |
| min | 0.000001 | 0.008764 | 0.033159 |
| 25% | 0.000351 | 0.034353 | 0.060851 |
| 50% | 0.017420 | 0.091182 | 0.093905 |
| 75% | 0.181794 | 0.163888 | 0.121860 |
| max | 0.398937 | 0.199471 | 0.132981 |
# Interactive chart of the normal PDFs. Hovering selects the nearest x value
# and reveals a marker, a vertical rule and the numeric value there.

# Mouse-driven selection keyed on the x encoding.
label = alt.selection_single(
    encodings=['x'], on='mouseover', nearest=True, empty='none'
)
# Base line chart: one coloured line per entry in the `variable` column.
chart = alt.Chart().mark_line().encode(
    alt.X('x:Q'), alt.Y('value:Q'), alt.Color('variable:N')
)
alt.layer(
    # The density curves themselves.
    chart,
    # Points with opacity 1 when selected and 0 otherwise; this layer also
    # carries the selection.
    chart.mark_circle().encode(
        opacity=alt.condition(label, alt.value(1), alt.value(0))
    ).add_selection(label),
    # Vertical guide, filtered down to the selected x.
    alt.Chart().mark_rule(color='darkgray').encode(
        alt.X('x:Q')
    ).transform_filter(label),
    # Value labels (six decimals), filtered down to the selected x.
    chart.mark_text(align='left', dx=5, dy=-5).encode(
        text=alt.Text('value:Q', format=',.6f')
    ).transform_filter(label),
    data=normal_pdf
).properties(width=600, title="Normal PDF")

# Normal CDF
Finally, we also calculate and plot the CDF of the normal distribution.
def calc_normal_cdf(x: float, mu: float = 0, sigma: float = 1) -> float:
    """Return the normal cumulative distribution function at ``x``.

    Args:
        x: Point at which to evaluate the CDF.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution. It is not validated;
            ``sigma == 0`` raises ``ZeroDivisionError``.

    Returns:
        ``(1 + erf((x - mu) / (sqrt(2) * sigma))) / 2``.
    """
    return (1 + m.erf((x - mu) / m.sqrt(2) / sigma)) / 2


# Evaluate the CDF on the existing grid `x` for sigma = 1, 2 and 3 (mu = 0
# throughout), then reshape to long form: one row per (x, variable, value).
normal_cdf = pd.DataFrame(
    {
        'x': x,
        'mu=0, sigma=1': x.apply(calc_normal_cdf),
        'mu=0, sigma=2': x.apply(lambda v: calc_normal_cdf(v, 0, 2)),
        'mu=0, sigma=3': x.apply(lambda v: calc_normal_cdf(v, 0, 3)),
    }
).melt(id_vars='x')
normal_cdf
| | x | variable | value |
|---|---|---|---|
| 0 | -5.00000 | mu=0, sigma=1 | 2.866516e-07 |
| 1 | -4.98999 | mu=0, sigma=1 | 3.019121e-07 |
| 2 | -4.97998 | mu=0, sigma=1 | 3.179543e-07 |
| 3 | -4.96997 | mu=0, sigma=1 | 3.348164e-07 |
| 4 | -4.95996 | mu=0, sigma=1 | 3.525386e-07 |
| ... | ... | ... | ... |
| 2995 | 4.95996 | mu=0, sigma=3 | 9.508671e-01 |
| 2996 | 4.96997 | mu=0, sigma=3 | 9.512055e-01 |
| 2997 | 4.97998 | mu=0, sigma=3 | 9.515421e-01 |
| 2998 | 4.98999 | mu=0, sigma=3 | 9.518768e-01 |
| 2999 | 5.00000 | mu=0, sigma=3 | 9.522096e-01 |
3000 rows × 3 columns
normal_cdf.describe()['value'].T
count 3.000000e+03
mean 5.000000e-01
std 3.760737e-01
min 2.866516e-07
25% 1.055739e-01
50% 5.000000e-01
75% 8.944261e-01
max 9.999997e-01
Name: value, dtype: float64
# Interactive chart of the normal CDFs: the curves plus three hover-driven
# overlays (marker, vertical rule, value label) sharing one selection.
label = alt.selection_single(
    nearest=True,
    on='mouseover',
    encodings=['x'],
    empty='none',
)
chart = alt.Chart().mark_line().encode(
    alt.X('x:Q'),
    alt.Y('value:Q'),
    alt.Color('variable:N'),
)
# Points with opacity 1 when selected and 0 otherwise; carries the selection.
hover_points = chart.mark_circle().encode(
    opacity=alt.condition(label, alt.value(1), alt.value(0)),
).add_selection(label)
# Vertical guide, filtered down to the selected x.
hover_rule = alt.Chart().mark_rule(color='darkgray').encode(
    alt.X('x:Q'),
).transform_filter(label)
# Value labels (six decimals), filtered down to the selected x.
hover_text = chart.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.Text('value:Q', format=',.6f'),
).transform_filter(label)
alt.layer(
    chart, hover_points, hover_rule, hover_text, data=normal_cdf
).properties(width=600, title="Normal CDFs")