import random
import pandas as pd
import numpy as np
import altair as alt
import math as m
from collections import CounterCentral Limit Theorem on the binomial distribution from scratch with Python
Central Limit Theorem on the binomial distribution from scratch with Python
Here I am playing with the Central Limit Theorem.
I am running bernoulli trials to generate binomial distributions.
I was interested in how the results approximate the normal distribution, so I labelled their status at particular iteration rounds and plotted the result.
The code is based upon the respective example from Data Science from Scratch.
Libraries
alt.data_transformers.disable_max_rows()DataTransformerRegistry.enable('default')
Parameters
p = .35
binomial_trials = 10000
total_rounds = 5000
step = 100Bernoulli Trials
def bernoulli_trial(p: float) -> int:
return 1 if random.random() < p else 0Let’s run the trials many times.
trials = []
for _ in range(binomial_trials):
trials.append(bernoulli_trial(p))
print("First thousand trials:")
print(" ".join([str(t) for t in trials[:1000]]))
print(f"\nOnes: {sum([t == 1 for t in trials])}, Zeros: {sum([t == 0 for t in trials])}")
print("\nMean value of 'success' (ones):", np.mean(trials))First thousand trials:
0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 1 0 1 1 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 0 0 1 1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 1 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 1 0 1 1 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 1 1 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 0 1 1 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 0 1 1 0 1 0 1 0 1 0 1 0 0 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 0 0 1 0 0 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 1 1 0 1 1 0 0 1 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1
Ones: 3594, Zeros: 6406
Mean value of 'success' (ones): 0.3594
def binomial(n: int, p: float) -> int:
return sum(bernoulli_trial(p) for _ in range(n))
trials = binomial(binomial_trials, p)
trials3441
Iterating the binomial
def iterate_binomial(n, p, num_points):
return [binomial(n, p) for _ in range(num_points)]
simulations = iterate_binomial(binomial_trials, p, total_rounds)
print("First hunded values:")
print(" ".join([str(results) for results in simulations[:100]]))
print("\nMean:", np.mean(simulations), "Standard deviation:", np.std(simulations))First hunded values:
3483 3596 3442 3470 3479 3570 3521 3512 3470 3499 3446 3495 3530 3527 3485 3537 3445 3538 3478 3542 3510 3444 3478 3384 3452 3566 3538 3548 3485 3425 3547 3456 3411 3515 3466 3454 3428 3485 3514 3524 3411 3530 3559 3401 3459 3513 3580 3533 3527 3502 3476 3465 3478 3475 3493 3537 3558 3499 3511 3496 3572 3555 3480 3541 3519 3477 3460 3496 3501 3524 3489 3499 3537 3527 3557 3508 3503 3530 3489 3497 3519 3509 3502 3484 3499 3488 3537 3567 3395 3469 3537 3440 3542 3493 3385 3539 3490 3504 3398 3555
Mean: 3500.871 Standard deviation: 47.232848304966744
As we are interested in the intermediate states, we add new rounds iteratively and labeling them in the way.
def add_iterations(probability, trials, total_rounds, step):
iteration_results = pd.DataFrame()
for round in range(step, total_rounds + 1, step):
simulations = iterate_binomial(trials, probability, round)
counts = Counter(simulations)
iteration_results = pd.concat(
[
iteration_results,
pd.DataFrame({'count': counts.values(), 'value': counts.keys(), 'iterations': round})
]
)
return iteration_results
iteration_results = add_iterations(p, binomial_trials, total_rounds, step)
iteration_results| count | value | iterations | |
|---|---|---|---|
| 0 | 1 | 3447 | 100 |
| 1 | 1 | 3457 | 100 |
| 2 | 1 | 3530 | 100 |
| 3 | 1 | 3496 | 100 |
| 4 | 1 | 3551 | 100 |
| ... | ... | ... | ... |
| 265 | 2 | 3410 | 5000 |
| 266 | 1 | 3663 | 5000 |
| 267 | 1 | 3372 | 5000 |
| 268 | 1 | 3644 | 5000 |
| 269 | 1 | 3622 | 5000 |
11949 rows × 3 columns
As we are interested in the total counts at each step we need to cumulatively add the value counts together over the iteration rounds.
def sum_counts(value_df):
return value_df.sort_values('iterations')['count'].cumsum()
iteration_results = iteration_results.sort_values(['value', 'iterations'])
iteration_results['cumulative count'] = iteration_results.groupby('value').apply(sum_counts).values
iteration_results| count | value | iterations | cumulative count | |
|---|---|---|---|---|
| 233 | 1 | 3285 | 4400 | 1 |
| 200 | 1 | 3286 | 4700 | 1 |
| 161 | 1 | 3288 | 1900 | 1 |
| 244 | 1 | 3313 | 2200 | 1 |
| 156 | 1 | 3314 | 3200 | 1 |
| ... | ... | ... | ... | ... |
| 130 | 1 | 3691 | 4700 | 1 |
| 240 | 1 | 3692 | 4100 | 1 |
| 144 | 1 | 3695 | 5000 | 1 |
| 155 | 1 | 3700 | 2400 | 1 |
| 271 | 1 | 3704 | 4500 | 1 |
11949 rows × 4 columns
Normal distributions
We also generate a normal distribution for reference
def calc_normal_cdf(x: float, mu: float = 0, sigma: float = 1) -> float:
return (1 + m.erf((x - mu) / m.sqrt(2) / sigma)) /2
def calc_normal_pdf(x: float, mu: float = 0, sigma: float=1) -> float:
return m.exp(-(x-mu)**2 / (2 * sigma **2)) * 1/(m.sqrt(2 * m.pi) * sigma)n = binomial_trials
mu = p * n
sigma = m.sqrt(n * p * (1 - p))
data = iteration_results.loc[iteration_results['iterations'] == iteration_results['iterations'].max(), 'value']
xs = range(min(data), max(data) + 1)
ys = [calc_normal_cdf(i + .5, mu, sigma) - calc_normal_cdf(i - .5, mu, sigma) for i in xs]
normal_dist = pd.DataFrame({'x': xs, 'y':ys})
normal_dist| x | y | |
|---|---|---|
| 0 | 3347 | 0.000049 |
| 1 | 3348 | 0.000052 |
| 2 | 3349 | 0.000056 |
| 3 | 3350 | 0.000060 |
| 4 | 3351 | 0.000064 |
| ... | ... | ... |
| 344 | 3691 | 0.000003 |
| 345 | 3692 | 0.000003 |
| 346 | 3693 | 0.000002 |
| 347 | 3694 | 0.000002 |
| 348 | 3695 | 0.000002 |
349 rows × 2 columns
Visualization
With the slider you can see the intermediate results at each iteration.
label = alt.selection_single(
encodings=['x'], on='mouseover', nearest=True, empty='none'
)
iteration_bounds = alt.binding_range(min=iteration_results['iterations'].min(), max=iteration_results['iterations'].max(), step=step)
iteration_slider = alt.selection_single(bind=iteration_bounds, fields=['iterations'], name='Iteration', init={'iterations': iteration_results['iterations'].median()})
bar_chart = alt.Chart(iteration_results).mark_bar().encode(
alt.X('value:Q', scale=alt.Scale(domain=(iteration_results['value'].min(), iteration_results['value'].max()))),
alt.Y('cumulative count:Q', scale=alt.Scale(domain=(iteration_results['cumulative count'].min(), iteration_results['cumulative count'].max()))),
opacity=alt.value(0.75)
)
line_chart = alt.Chart(normal_dist).mark_line().encode(
alt.X('x'), alt.Y('y',),color=alt.value('darkred')
)
bar_chart = alt.layer(
bar_chart.add_selection(iteration_slider).transform_filter(iteration_slider),
bar_chart.mark_circle().encode(opacity=alt.condition(label, alt.value(1), alt.value(0))).add_selection(label).transform_filter(iteration_slider),
bar_chart.mark_text(dx=5, dy=-5, align='left', strokeWidth=.5, stroke='black').encode(text=alt.Text('cumulative count:Q')).transform_filter(label).transform_filter(iteration_slider)
)
alt.layer(
bar_chart,
line_chart,
).resolve_scale(y='independent').properties(width=800)