Central Limit Theorem on the binomial distribution from scratch with Python

fastai

Published

October 3, 2020

Central Limit Theorem on the binomial distribution from scratch with Python

Here I am playing with the Central Limit Theorem.

I am running bernoulli trials to generate binomial distributions.

I was interested in how the results approximate the normal distribution, so I labelled their status at particular iteration rounds and plotted the result.

The code is based upon the respective example from Data Science from Scratch.

Libraries

import random
import pandas as pd
import numpy as np
import altair as alt
import math as m
from collections import Counter

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable(&#39;default&#39;)

Parameters

p = .35
binomial_trials = 10000
total_rounds = 5000
step = 100

Bernoulli Trials

def bernoulli_trial(p: float) -> int:
    return 1 if random.random() < p else 0

Let’s run the trials many times.

trials =  []
for _ in range(binomial_trials):
    trials.append(bernoulli_trial(p))

print("First thousand trials:")
print(" ".join([str(t) for t in trials[:1000]]))
print(f"\nOnes: {sum([t == 1 for t in trials])}, Zeros: {sum([t == 0 for t in trials])}")
print("\nMean value of 'success' (ones):", np.mean(trials))

First thousand trials:
0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 1 0 1 1 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 0 0 1 1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 1 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 1 0 1 1 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 1 1 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 0 1 1 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 0 1 1 0 1 0 1 0 1 0 1 0 0 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 0 0 1 0 0 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 1 1 0 1 1 0 0 1 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1

Ones: 3594, Zeros: 6406

Mean value of &#39;success&#39; (ones): 0.3594

def binomial(n: int, p: float) -> int:
    return sum(bernoulli_trial(p) for _ in range(n))

trials = binomial(binomial_trials, p)
trials

Iterating the binomial

def iterate_binomial(n, p, num_points):
    return [binomial(n, p) for _ in range(num_points)]

simulations = iterate_binomial(binomial_trials, p, total_rounds)

print("First hunded values:")
print(" ".join([str(results) for results in simulations[:100]]))
print("\nMean:", np.mean(simulations), "Standard deviation:", np.std(simulations))

First hunded values:
3483 3596 3442 3470 3479 3570 3521 3512 3470 3499 3446 3495 3530 3527 3485 3537 3445 3538 3478 3542 3510 3444 3478 3384 3452 3566 3538 3548 3485 3425 3547 3456 3411 3515 3466 3454 3428 3485 3514 3524 3411 3530 3559 3401 3459 3513 3580 3533 3527 3502 3476 3465 3478 3475 3493 3537 3558 3499 3511 3496 3572 3555 3480 3541 3519 3477 3460 3496 3501 3524 3489 3499 3537 3527 3557 3508 3503 3530 3489 3497 3519 3509 3502 3484 3499 3488 3537 3567 3395 3469 3537 3440 3542 3493 3385 3539 3490 3504 3398 3555

Mean: 3500.871 Standard deviation: 47.232848304966744

As we are interested in the intermediate states, we add new rounds iteratively and labeling them in the way.

def add_iterations(probability, trials, total_rounds, step):
    iteration_results = pd.DataFrame()

    for round in range(step, total_rounds + 1, step):
        simulations = iterate_binomial(trials, probability, round)

        counts = Counter(simulations)
        iteration_results = pd.concat(
            [
                iteration_results,
                pd.DataFrame({'count': counts.values(), 'value': counts.keys(), 'iterations': round})
            ]
        )

    return iteration_results


iteration_results = add_iterations(p, binomial_trials, total_rounds, step)
iteration_results

	count	value	iterations
0	1	3447	100
1	1	3457	100
2	1	3530	100
3	1	3496	100
4	1	3551	100
...	...	...	...
265	2	3410	5000
266	1	3663	5000
267	1	3372	5000
268	1	3644	5000
269	1	3622	5000

11949 rows × 3 columns

As we are interested in the total counts at each step we need to cumulatively add the value counts together over the iteration rounds.

def sum_counts(value_df):
    return value_df.sort_values('iterations')['count'].cumsum()

iteration_results = iteration_results.sort_values(['value', 'iterations'])

iteration_results['cumulative count'] = iteration_results.groupby('value').apply(sum_counts).values

iteration_results

	count	value	iterations	cumulative count
233	1	3285	4400	1
200	1	3286	4700	1
161	1	3288	1900	1
244	1	3313	2200	1
156	1	3314	3200	1
...	...	...	...	...
130	1	3691	4700	1
240	1	3692	4100	1
144	1	3695	5000	1
155	1	3700	2400	1
271	1	3704	4500	1

11949 rows × 4 columns

Normal distributions

We also generate a normal distribution for reference

def calc_normal_cdf(x: float, mu: float = 0, sigma: float = 1) -> float:
    return (1 + m.erf((x - mu) / m.sqrt(2) / sigma)) /2

def calc_normal_pdf(x: float, mu: float = 0, sigma: float=1) -> float:
    return m.exp(-(x-mu)**2 / (2 * sigma **2)) * 1/(m.sqrt(2 * m.pi) * sigma)

n = binomial_trials

mu = p * n
sigma = m.sqrt(n * p * (1 - p))

data = iteration_results.loc[iteration_results['iterations'] == iteration_results['iterations'].max(), 'value']

xs = range(min(data), max(data) + 1)
ys = [calc_normal_cdf(i + .5, mu, sigma) - calc_normal_cdf(i - .5, mu, sigma) for i in xs]

normal_dist = pd.DataFrame({'x': xs, 'y':ys})
normal_dist

	x	y
0	3347	0.000049
1	3348	0.000052
2	3349	0.000056
3	3350	0.000060
4	3351	0.000064
...	...	...
344	3691	0.000003
345	3692	0.000003
346	3693	0.000002
347	3694	0.000002
348	3695	0.000002

349 rows × 2 columns

Visualization

With the slider you can see the intermediate results at each iteration.

label = alt.selection_single(
    encodings=['x'], on='mouseover', nearest=True, empty='none'
)


iteration_bounds = alt.binding_range(min=iteration_results['iterations'].min(), max=iteration_results['iterations'].max(), step=step)
iteration_slider = alt.selection_single(bind=iteration_bounds, fields=['iterations'], name='Iteration', init={'iterations': iteration_results['iterations'].median()})

bar_chart = alt.Chart(iteration_results).mark_bar().encode(
    alt.X('value:Q', scale=alt.Scale(domain=(iteration_results['value'].min(), iteration_results['value'].max()))),
    alt.Y('cumulative count:Q', scale=alt.Scale(domain=(iteration_results['cumulative count'].min(), iteration_results['cumulative count'].max()))),
    opacity=alt.value(0.75)
)


line_chart = alt.Chart(normal_dist).mark_line().encode(
    alt.X('x'), alt.Y('y',),color=alt.value('darkred')
)

bar_chart = alt.layer(
    bar_chart.add_selection(iteration_slider).transform_filter(iteration_slider),
    bar_chart.mark_circle().encode(opacity=alt.condition(label, alt.value(1), alt.value(0))).add_selection(label).transform_filter(iteration_slider),
    bar_chart.mark_text(dx=5, dy=-5, align='left', strokeWidth=.5, stroke='black').encode(text=alt.Text('cumulative count:Q')).transform_filter(label).transform_filter(iteration_slider)
)

alt.layer(
    bar_chart,
    line_chart,    
).resolve_scale(y='independent').properties(width=800)