Gradient descent from scratch with Python

From the Data Science from Scratch book.

Libraries and helper functions

import random
import math as m  # magnitude below uses m.sqrt
from typing import Callable, List

import pandas as pd
import altair as alt

Vector = List[float]

Vector
typing.List[float]
def add(vector1: Vector, vector2: Vector) -> Vector:
    assert len(vector1) == len(vector2)
    return [v1 + v2 for v1, v2 in zip(vector1, vector2)]

def subtract(vector1: Vector, vector2: Vector) -> Vector:  # needed by distance
    assert len(vector1) == len(vector2)
    return [v1 - v2 for v1, v2 in zip(vector1, vector2)]

def scalar_multiply(c: float, vector: Vector) -> Vector:
    return [c * v for v in vector]

def dot(vector1: Vector, vector2: Vector) -> float:
    assert len(vector1) == len(vector2)
    return sum(v1 * v2 for v1, v2 in zip(vector1, vector2))

def sum_of_squares(v: Vector) -> float:
    return dot(v, v)

def magnitude(v: Vector) -> float:
    return m.sqrt(sum_of_squares(v))

def distance(vector1: Vector, vector2: Vector) -> float:
    return magnitude(subtract(vector1, vector2))

def vector_sum(vectors: List[Vector]) -> Vector:
    assert vectors
    vector_length = len(vectors[0])
    assert all(len(v) == vector_length for v in vectors)
    sums = [0] * vector_length
    for vector in vectors:
        sums = add(sums, vector)
    return sums

def vector_mean(vectors: List[Vector]) -> Vector:
    n = len(vectors)
    return scalar_multiply(1/n, vector_sum(vectors))

Estimate gradient
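To estimate the gradient of f at v numerically, we nudge one coordinate at a time by a small h and take the difference quotient (f(w) - f(v)) / h, giving one partial-derivative estimate per coordinate.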
def partial_difference_quotient(f: Callable[[Vector], float],
                                v: Vector, i: int, h: float) -> float:
    # Called by estimate_gradient but missing from the original cell;
    # this is the book's definition: nudge coordinate i of v by h.
    w = [v_j + (h if j == i else 0) for j, v_j in enumerate(v)]
    return (f(w) - f(v)) / h

def estimate_gradient(
    f: Callable[[Vector], float],
    v: Vector,
    h: float = 0.0001
) -> Vector:
    return [
        partial_difference_quotient(f, v, i, h)
        for i in range(len(v))
    ]

v = [random.uniform(-10, 10) for i in range(3)]
v
[-9.684378319623278, 4.813838863175313, 3.1311841279856303]
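As a quick sanity check (not in the original notebook), the numerical estimate should match a gradient we know in closed form; for f(w) = w[0] * w[1] the gradient is [w[1], w[0]]:

estimate_gradient(lambda w: w[0] * w[1], [3.0, 4.0])  # ≈ [4.0, 3.0]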
Adjusting with gradients
def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector:
    """Return vector adjusted with step. Step is gradient times step size."""
    step = scalar_multiply(step_size, gradient)
    return add(v, step)
v, gradient_step(v, [1, 1, 1], step_size=0.1)
([-9.684378319623278, 4.813838863175313, 3.1311841279856303],
 [-9.584378319623278, 4.913838863175313, 3.2311841279856304])
def sum_of_squares_gradient(v: Vector) -> Vector:
    # The partial derivative of sum(v_i ** 2) with respect to v_i is 2 * v_i.
    return [2 * v_i for v_i in v]

sum_of_squares_gradient(v)
[-19.368756639246556, 9.627677726350626, 6.262368255971261]
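Before using the analytic gradient, we can confirm it agrees with the numerical estimate from estimate_gradient (a check added here, not in the original):

assert all(
    abs(approx - exact) < 1e-3
    for approx, exact in zip(estimate_gradient(sum_of_squares, v),
                             sum_of_squares_gradient(v))
)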
# Track each step of gradient descent toward the minimum of sum_of_squares.
# (pandas removed DataFrame.append in 2.0, so collect rows in a list instead.)
rows = []
for epoch in range(1000):
    grad = sum_of_squares_gradient(v)
    v = gradient_step(v, grad, -0.01)  # negative step size: move against the gradient
    rows.append({'x': v[0], 'y': v[1], 'z': v[2]})
steps = pd.DataFrame(rows)
steps

| | x | y | z |
|---|---|---|---|
| 0 | -9.490691e+00 | 4.717562e+00 | 3.068560e+00 |
| 1 | -9.300877e+00 | 4.623211e+00 | 3.007189e+00 |
| 2 | -9.114859e+00 | 4.530747e+00 | 2.947045e+00 |
| 3 | -8.932562e+00 | 4.440132e+00 | 2.888105e+00 |
| 4 | -8.753911e+00 | 4.351329e+00 | 2.830342e+00 |
| ... | ... | ... | ... |
| 995 | -1.767027e-08 | 8.783406e-09 | 5.713207e-09 |
| 996 | -1.731686e-08 | 8.607737e-09 | 5.598943e-09 |
| 997 | -1.697053e-08 | 8.435583e-09 | 5.486964e-09 |
| 998 | -1.663111e-08 | 8.266871e-09 | 5.377225e-09 |
| 999 | -1.629849e-08 | 8.101534e-09 | 5.269681e-09 |
1000 rows × 3 columns
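After 1,000 steps, v should be essentially the zero vector, the minimum of sum_of_squares (this assert is an addition; the book includes a similar check):

assert distance(v, [0, 0, 0]) < 0.001  # v has (almost) converged to zero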
alt.Chart(steps.reset_index().melt(id_vars='index')).mark_point().encode(
    alt.X('index:Q'), alt.Y('value:Q'), alt.Color('variable:N')
).properties(title='Gradient steps')

Changing the step size
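To see the effect of the step size on a real task, we generate points on the line y = 20 * x + 5 and then try to recover the slope and intercept by gradient descent.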
inputs = [(x, 20 * x + 5) for x in range(-50, 50)]
inputs[:20]
[(-50, -995),
(-49, -975),
(-48, -955),
(-47, -935),
(-46, -915),
(-45, -895),
(-44, -875),
(-43, -855),
(-42, -835),
(-41, -815),
(-40, -795),
(-39, -775),
(-38, -755),
(-37, -735),
(-36, -715),
(-35, -695),
(-34, -675),
(-33, -655),
(-32, -635),
(-31, -615)]
We use the gradient of the squared error for a simple linear model.

def linear_gradient(x: float, y: float, theta: Vector) -> Vector:
    slope, intercept = theta
    predicted = slope * x + intercept
    error = predicted - y  # the loss is error ** 2
    return [2 * error * x, 2 * error]
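Since the loss at a single point is error ** 2 with error = slope * x + intercept - y, the chain rule gives a partial derivative of 2 * error * x with respect to the slope and 2 * error with respect to the intercept, which is exactly the vector linear_gradient returns.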
x, y = inputs[0]
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

x, y, theta, linear_gradient(x, y, theta)
(-50,
 -995,
 [-0.0327443167639494, -0.6719198550797425],
 [-99596.52959831177, 1991.9305919662354])
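A single point's gradient is dominated by its x value (here x = -50), so the training loop below averages linear_gradient over the whole dataset with vector_mean before taking a step.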
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]
learning_rate = 0.001

for epoch in range(20):
    grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs])
    theta = gradient_step(theta, grad, -learning_rate)
    print(epoch, theta)

0 [32.89133392765593, 0.47755932457342315]
1 [11.396957829578067, 0.4994955398519324]
2 [25.733728623211277, 0.49989350660180654]
3 [16.171102901824685, 0.5146274482118142]
4 [22.549388991931146, 0.5197692962172151]
5 [18.295077311678142, 0.5312791466167119]
6 [21.132714712257297, 0.5385116656351566]
7 [19.24001779859002, 0.5485673570161436]
8 [20.50245669569747, 0.5567102401007012]
9 [19.660418094209888, 0.5660992763161973]
10 [20.22206723043832, 0.5746274958577747]
11 [19.8474557847935, 0.5837003080964975]
12 [20.097330691850832, 0.592380363265098]
13 [19.93067280889876, 0.6012929332304187]
14 [20.04184252939776, 0.6100210201728566]
15 [19.967701053911867, 0.6188428206619087]
16 [20.017162239861445, 0.6275728360744967]
17 [19.98418035884849, 0.6363348526422091]
18 [20.0061880355007, 0.6450463632957731]
19 [19.99151762668433, 0.6537624586046823]
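After 20 epochs the slope is oscillating ever closer to the true value 20, while the intercept is still creeping toward the true value 5 and would need many more epochs to get there. As a rough sketch (not part of the original), rerunning the same loop at different learning rates shows why 0.001 was a sensible choice: a much larger step diverges, and a much smaller one barely moves.

# Sketch: the same training loop at three learning rates.
for learning_rate in [0.01, 0.001, 0.000001]:
    theta = [random.uniform(-1, 1), random.uniform(-1, 1)]
    for epoch in range(20):
        grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs])
        theta = gradient_step(theta, grad, -learning_rate)
    print(learning_rate, theta)  # 0.01 blows up; 0.000001 barely leaves the start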