Gradient descent from scratch with Python

fastai
Published October 20, 2020

From the book Data Science from Scratch by Joel Grus.

Libraries and helper functions

import math
import random
from typing import List, Callable

import pandas as pd
import altair as alt

Vector = List[float]
Vector
typing.List[float]
def add(vector1: Vector, vector2: Vector) -> Vector:
    assert len(vector1) == len(vector2)
    return [v1 + v2 for v1, v2 in zip(vector1, vector2)]
def subtract(vector1: Vector, vector2: Vector) -> Vector:
    assert len(vector1) == len(vector2)
    return [v1 - v2 for v1, v2 in zip(vector1, vector2)]
def scalar_multiply(c: float, vector: Vector) -> Vector:
    return [c * v for v in vector]
def dot(vector1: Vector, vector2: Vector) -> float:
    assert len(vector1) == len(vector2)

    return sum(v1 * v2 for v1, v2 in zip(vector1, vector2))
def sum_of_squares(v: Vector) -> float:
    return dot(v, v)
def magnitude(v: Vector) -> float:
    return math.sqrt(sum_of_squares(v))
def distance(vector1: Vector, vector2: Vector) -> float:
    return magnitude(subtract(vector1, vector2))
def vector_sum(vectors: List[Vector]) -> Vector:
    assert vectors
    
    vector_length = len(vectors[0])
    assert all(len(v) == vector_length for v in vectors)

    sums = [0] * vector_length
    for vector in vectors:
        sums = add(sums, vector)

    return sums
def vector_mean(vectors: List[Vector]) -> Vector:
    n = len(vectors)
    return scalar_multiply(1/n, vector_sum(vectors))
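
A quick sanity check of the helpers (the example values here are my addition, not from the original post):

assert add([1, 2], [3, 4]) == [4, 6]
assert dot([1, 2, 3], [4, 5, 6]) == 32
assert magnitude([3, 4]) == 5.0
assert vector_mean([[1, 2], [3, 4], [5, 6]]) == [3.0, 4.0]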

Estimating the gradient

def partial_difference_quotient(f: Callable[[Vector], float], v: Vector, i: int, h: float) -> float:
    """Estimate the i-th partial derivative of f at v."""
    w = [v_j + (h if j == i else 0) for j, v_j in enumerate(v)]
    return (f(w) - f(v)) / h
def estimate_gradient(
    f: Callable[[Vector], float],
    v: Vector,
    h: float = 0.0001
) -> Vector:
    return [
        partial_difference_quotient(f, v, i, h)
        for i in range(len(v))
    ]
v = [random.uniform(-10, 10) for _ in range(3)]
v
[-9.684378319623278, 4.813838863175313, 3.1311841279856303]
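
As a quick check (my addition, not in the original), the estimate should agree with the exact gradient of sum_of_squares, which is 2 * v:

estimated = estimate_gradient(sum_of_squares, v)
exact = [2 * v_i for v_i in v]  # analytic gradient of dot(v, v)

# the finite-difference error is on the order of h
assert all(abs(e1 - e2) < 0.001 for e1, e2 in zip(estimated, exact))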

Adjusting with gradients

def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector:
    """Return v moved `step_size` along `gradient`
    (a negative step_size moves against the gradient)."""
    step = scalar_multiply(step_size, gradient)
    return add(v, step)

v, gradient_step(v, [1, 1, 1], step_size=0.1)
([-9.684378319623278, 4.813838863175313, 3.1311841279856303],
 [-9.584378319623278, 4.913838863175313, 3.2311841279856304])
def sum_of_squares_gradient(v: Vector) -> Vector:
    # sum_of_squares(v) is the sum of v_i ** 2, so each partial derivative is 2 * v_i
    return [2 * v_i for v_i in v]

sum_of_squares_gradient(v)
[-19.368756639246556, 9.627677726350626, 6.262368255971261]
rows = []

for epoch in range(1000):
    grad = sum_of_squares_gradient(v)
    v = gradient_step(v, grad, -0.01)  # negative step size: move against the gradient
    rows.append({'x': v[0], 'y': v[1], 'z': v[2]})

steps = pd.DataFrame(rows)

steps
                x             y             z
0   -9.490691e+00  4.717562e+00  3.068560e+00
1   -9.300877e+00  4.623211e+00  3.007189e+00
2   -9.114859e+00  4.530747e+00  2.947045e+00
3   -8.932562e+00  4.440132e+00  2.888105e+00
4   -8.753911e+00  4.351329e+00  2.830342e+00
..            ...           ...           ...
995 -1.767027e-08  8.783406e-09  5.713207e-09
996 -1.731686e-08  8.607737e-09  5.598943e-09
997 -1.697053e-08  8.435583e-09  5.486964e-09
998 -1.663111e-08  8.266871e-09  5.377225e-09
999 -1.629849e-08  8.101534e-09  5.269681e-09

1000 rows × 3 columns

alt.Chart(steps.reset_index().melt(id_vars='index')).mark_point().encode(
    alt.X('index:Q'), alt.Y('value:Q'), alt.Color('variable:N')
).properties(title='Gradient steps')
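
After 1,000 steps, v has decayed essentially to zero (each step multiplies every coordinate by 0.98). Following the book, we can assert this:

assert distance(v, [0, 0, 0]) < 0.001, "v should be close to 0"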

Changing the step size
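
The step size matters: too small and progress is glacial, too large and the iterates blow up. A quick sketch of this on sum_of_squares (my example, not from the original post):

for step_size in [0.0001, 0.1, 1.1]:
    w = [9.0, -3.0]  # arbitrary starting point
    for _ in range(100):
        w = gradient_step(w, sum_of_squares_gradient(w), -step_size)
    print(step_size, w)  # 0.0001 barely moves, 0.1 converges, 1.1 diverges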

inputs = [(x, 20 * x + 5) for x in range(-50, 50)]
inputs[:20]
[(-50, -995),
 (-49, -975),
 (-48, -955),
 (-47, -935),
 (-46, -915),
 (-45, -895),
 (-44, -875),
 (-43, -855),
 (-42, -835),
 (-41, -815),
 (-40, -795),
 (-39, -775),
 (-38, -755),
 (-37, -735),
 (-36, -715),
 (-35, -695),
 (-34, -675),
 (-33, -655),
 (-32, -635),
 (-31, -615)]

The data lie exactly on the line y = 20x + 5, so gradient descent should recover slope 20 and intercept 5. For a single point, the loss is the squared error (predicted - y) ** 2, and its gradient with respect to theta = [slope, intercept] is [2 * error * x, 2 * error].

def linear_gradient(x: float, y: float, theta: Vector) -> Vector:
    slope, intercept = theta
    predicted = slope * x + intercept
    error = predicted - y  # the loss for this point is error ** 2
    return [2 * error * x, 2 * error]  # partial derivatives w.r.t. slope and intercept

x, y = inputs[0]
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

x, y, theta, linear_gradient(x, y, theta)    
(-50,
 -995,
 [-0.0327443167639494, -0.6719198550797425],
 [-99596.52959831177, 1991.9305919662354])
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

learning_rate = 0.001

for epoch in range(20):
    grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs])
    theta = gradient_step(theta, grad, -learning_rate)
    print(epoch, theta)
0 [32.89133392765593, 0.47755932457342315]
1 [11.396957829578067, 0.4994955398519324]
2 [25.733728623211277, 0.49989350660180654]
3 [16.171102901824685, 0.5146274482118142]
4 [22.549388991931146, 0.5197692962172151]
5 [18.295077311678142, 0.5312791466167119]
6 [21.132714712257297, 0.5385116656351566]
7 [19.24001779859002, 0.5485673570161436]
8 [20.50245669569747, 0.5567102401007012]
9 [19.660418094209888, 0.5660992763161973]
10 [20.22206723043832, 0.5746274958577747]
11 [19.8474557847935, 0.5837003080964975]
12 [20.097330691850832, 0.592380363265098]
13 [19.93067280889876, 0.6012929332304187]
14 [20.04184252939776, 0.6100210201728566]
15 [19.967701053911867, 0.6188428206619087]
16 [20.017162239861445, 0.6275728360744967]
17 [19.98418035884849, 0.6363348526422091]
18 [20.0061880355007, 0.6450463632957731]
19 [19.99151762668433, 0.6537624586046823]
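
After 20 epochs the slope is already close to 20, but the intercept is still crawling toward 5. Running a few thousand more epochs (the book uses 5000) gets both parameters there:

for epoch in range(5000):
    grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs])
    theta = gradient_step(theta, grad, -learning_rate)

slope, intercept = theta
assert 19.9 < slope < 20.1, "slope should be about 20"
assert 4.9 < intercept < 5.1, "intercept should be about 5"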