Confidence intervals from scratch with Python

fastai

Published

October 14, 2020

Confidence intervals from scratch with Python

From the Data Science from Scratch book.

Libraries and helper functions

import math as m

def calc_normal_cdf(x: float, mu: float = 0, sigma: float = 1) -> float:
    return (1 + m.erf((x - mu) / m.sqrt(2) / sigma)) / 2

def calc_inverse_normal_cdf(p: float, mu:float = 0, sigma: float = 1, tolerance: float = 1E-5, show_steps=False) -> float:

    if p == 0: return -np.inf
    if p == 1: return np.inf

    # In case it is not a standard normal distribution, calculate the standard normal first and then rescale
    if mu != 0 or sigma != 1:
        return mu + sigma * calc_inverse_normal_cdf(p, tolerance=tolerance)

    low_z = -10
    hi_z = 10

    if show_steps: print(f"{'':<19}".join(['low_z', 'mid_z', 'hi_z']), "\n")

    while hi_z - low_z > tolerance:
        mid_z = (low_z + hi_z) / 2
        mid_p = calc_normal_cdf(mid_z)

        if mid_p < p:
            low_z = mid_z
        else:
            hi_z = mid_z

        if show_steps: print("\t".join(map(to_string, [low_z, mid_z, hi_z])))

    return mid_z

def normal_upper_bound(probabilty: float, mu: float = 0, sigma: float = 1) -> float:
    """Return z for which P(Z <= z) = probability"""
    return calc_inverse_normal_cdf(probabilty, mu, sigma)

def normal_lower_bound(probabilty: float, mu: float = 0, sigma: float = 1) -> float:
    """Return z for which P(Z >= z) = probability"""
    return calc_inverse_normal_cdf(1 - probabilty, mu, sigma)

def normal_two_sided_bounds(probability: float, mu: float = 0, sigma: float = 1) -> float:
    if probability == 0: return 0, 0

    tail_probability = (1 - probability) / 2
    
    lower_bound = normal_upper_bound(tail_probability, mu, sigma)
    upper_bound = normal_lower_bound(tail_probability, mu, sigma)

    return lower_bound, upper_bound

Example 1

We would like to know if a coin is ‘fair’ or not (i.e. p=0.5)

We flip the coin 1000 times and observe 525 heads.

As we do not know the ‘real’ probability, we need to rely on our observed results.

p_hat = 525 / 1000
mu = p_hat
sigma = m.sqrt(p_hat * (1 - p_hat) / 1000)

p_hat, mu, sigma

(0.525, 0.525, 0.015791611697353755)

While we cannot tell if this is really the probabiliy of heads, we can calculate the interval within which we expect the real probability fall with our chosen confidence (here: 95%)

confidence = 0.95
normal_two_sided_bounds(confidence, mu, sigma)

(0.4940490278129096, 0.5559509721870904)

As 0.5 is within the interval, we cannot reject the hypothesis that the coin is fair.

Example 2

In another example we observe 540 heads in 1000 flips.

p_hat = 540 / 1000
mu = p_hat
sigma = m.sqrt(p_hat * (1 - p_hat) / 1000)

p_hat, mu, sigma

(0.54, 0.54, 0.015760710643876435)

Going through the same steps as above, we get that 0.5 falls outside our 95% confidence interval and therefore we reject the hypothesis that the coin is ‘fair’.

normal_two_sided_bounds(confidence, mu, sigma)

(0.5091095927295919, 0.5708904072704082)