import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
Building a light-weight autograd engine. Inspired by Karpathy’s micrograd library.
The Value Class
littlegrad will closely follow the PyTorch API. PyTorch uses tensors to store values. We’ll create a Value object which can be thought of as an equivalent of the tensor, except that it only stores a single scalar (and is, of course, much less powerful than a tensor).
Let’s begin by adding the addition and multiplication functionality to the Value
object.
class Value:
    def __init__(self, data):
        self.data = data

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        out = Value(self.data + other.data)
        return out

    def __mul__(self, other):
        out = Value(self.data * other.data)
        return out
a = Value(2.0)
a
Value(data=2.0)
b = Value(3.0)
a + b
Value(data=5.0)
a * b
Value(data=6.0)
Right, so adding and multiplying two Value objects works correctly. There is a small problem though.
c = 3.0
a * c # AttributeError: 'float' object has no attribute 'data'
AttributeError: 'float' object has no attribute 'data'
We haven’t defined adding/multiplying a Value object with an int or float. The fix is simple: if the other operand is not a Value object, wrap it in the Value class.
class Value:
    def __init__(self, data):
        self.data = data

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data)
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data)
        return out
a = Value(2.0); c = 3.0
a * c
Value(data=6.0)
And it works! But there’s still a catch.
c * a # TypeError: unsupported operand type(s) for *: 'float' and 'Value'
TypeError: unsupported operand type(s) for *: 'float' and 'Value'
Multiplication of a Value object with an int/float only works if the Value object is the first operand. Thankfully, there’s an easy way out using the __rmul__ dunder method: when other * self fails because other’s type doesn’t know how to multiply by a Value, Python falls back to self.__rmul__(other).
class Value:
    def __init__(self, data):
        self.data = data

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data)
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data)
        return out

    def __rmul__(self, other): # other * self
        return self * other
a = Value(2.0); c = 3.0
c * a
Value(data=6.0)
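Note that addition has the same issue: 3.0 + a would still raise a TypeError, because we have only defined __rmul__. A minimal fix, sketched here but not added to the class above, would be an analogous __radd__ method:

    # Hypothetical addition to the Value class: mirror __rmul__ for addition,
    # so that float + Value (e.g. 3.0 + a) also works.
    def __radd__(self, other): # other + self
        return self + other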
Since the number of variables and operations grows very quickly in a neural network, let’s add labels to keep track of the variables and the operations in the Value
class. Also, we’ll keep track of the children of every variable (if x = a + b
, then a
and b
are x
’s children).
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self._prev = _children
        self.label = label
        self._op = _op

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')
        return out

    def __rmul__(self, other): # other * self
        return self * other
a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = a + b; c.label = 'c'
c
Value(data=-1.0)
c._op
'+'
c._prev
(Value(data=2.0), Value(data=-3.0))
Next, we’ll add a .grad
attribute to our Value
class to store the gradients. The gradient, initially, is 0.
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._prev = _children
        self.label = label
        self._op = _op

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')
        return out

    def __rmul__(self, other): # other * self
        return self * other
Let’s take a slightly more involved example.
a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
e = a*b; e.label = 'e'
d = e + c; d.label = 'd'
f = Value(-2.0, label='f')
L = d * f; L.label = 'L'
L
Value(data=-8.0)
Visualising the Computational Graph
What we want now is to visualise this sequence of operations in a directed graph. The following code uses the graphviz
library to do just that.
from graphviz import Digraph

def trace(root):
    # builds a set of all nodes and edges in a graph
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # LR = left to right

    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        # for any value in the graph, create a rectangular ('record') node for it
        dot.node(name = uid, label = "{ %s | data %.4f | grad %.4f }" % (n.label, n.data, n.grad), shape='record')
        if n._op:
            # if this value is a result of some operation, create an op node for it
            dot.node(name = uid + n._op, label = n._op)
            # and connect this node to it
            dot.edge(uid + n._op, uid)

    for n1, n2 in edges:
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)

    return dot
draw_dot(L)
Computing Gradients using Chain Rule
We are now in a position to start computing gradients of L
for the backward pass, with respect to all the variables that precede it.
Let’s review some basic Differential Calculus.
Consider, \(L = f * d\).
What is \(\frac{dL}{df}\)? The basic rules of derivatives tell us it’s \(d\).
Let’s move a step back.
\(d = c + e\).
We know that \(\frac{dd}{de} = 1.0\).
Now, what’s \(\frac{dL}{de}\) ?
Using the chain rule,
\(\frac{dL}{de} = \frac{dL}{dd} \times \frac{dd}{de} = \frac{dL}{dd} \times 1.0 = \frac{dL}{dd}\)
Similarly, \(\frac{dL}{dc} = \frac{dL}{dd}\).
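To make this concrete, here is the whole backward pass for our graph worked out by hand, using \(a = 2\), \(b = -3\), \(c = 10\), \(f = -2\) (so \(e = -6\), \(d = 4\), \(L = -8\)):
\[\frac{dL}{dL} = 1, \quad \frac{dL}{dd} = f = -2, \quad \frac{dL}{df} = d = 4\]
\[\frac{dL}{de} = \frac{dL}{dd} \times 1 = -2, \quad \frac{dL}{dc} = \frac{dL}{dd} \times 1 = -2\]
\[\frac{dL}{da} = \frac{dL}{de} \times \frac{de}{da} = -2 \times b = 6, \quad \frac{dL}{db} = \frac{dL}{de} \times \frac{de}{db} = -2 \times a = -4\]
These are exactly the numbers we should see in the grad fields after the backward pass.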
Using these observations and the Chain Rule in general, we can write a _backward()
function for addition and multiplication which propagates the output’s gradient to its children.
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None
        self._prev = _children
        self.label = label
        self._op = _op

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad * 1.0
            other.grad += out.grad * 1.0
        out._backward = _backward
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other): # other * self
        return self * other
a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
e = a*b; e.label = 'e'
d = e + c; d.label = 'd'
f = Value(-2.0, label='f')
L = d * f; L.label = 'L'
L
Value(data=-8.0)
L.grad = 1.0 # init the gradient with 1.0
L._backward(); f._backward(); d._backward(); e._backward(); c._backward(); b._backward(); a._backward()
draw_dot(L)
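As a quick sanity check (just a sketch, using a hypothetical forward helper that is not part of littlegrad), any of these gradients can be estimated numerically with a central finite difference and compared to what the _backward calls produced. For dL/da:

# Numerical estimate of dL/da: rebuild the forward pass twice with a nudged by +/- h.
def forward(a_val):
    a = Value(a_val, label='a')
    b = Value(-3.0, label='b')
    c = Value(10.0, label='c')
    e = a * b; d = e + c
    f = Value(-2.0, label='f')
    return (d * f).data

h = 1e-6
print((forward(2.0 + h) - forward(2.0 - h)) / (2 * h))  # ~6.0, matching a.grad above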
Next, let’s create a single backward()
function to compute all the gradients in one go.
Topological Sort
First, we want to convert the above graph into a sequence in which every node appears only after all of its children. This is done using topological sort.
topo = []
visited = set()
def build_topo(v):
    if v not in visited:
        visited.add(v)
        for child in v._prev:
            build_topo(child)
        topo.append(v)
build_topo(L)
topo
[Value(data=2.0),
Value(data=-3.0),
Value(data=-6.0),
Value(data=10.0),
Value(data=4.0),
Value(data=-2.0),
Value(data=-8.0)]
The Values are now in the correct order: every node appears after its children. For the backward pass, we iterate this sequence in reverse order and call each node's _backward(), with L.grad already seeded to 1.0 as above.
for node in reversed(topo):
    node._backward()
Let’s wrap this functionality into a backward()
function in the Value
class.
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None
        self._prev = _children
        self.label = label
        self._op = _op

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad * 1.0
            other.grad += out.grad * 1.0
        out._backward = _backward
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other): # other * self
        return self * other

    def backward(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
e = a*b; e.label = 'e'
d = e + c; d.label = 'd'
f = Value(-2.0, label='f')
L = d * f; L.label = 'L'
L
Value(data=-8.0)
L.backward()
draw_dot(L)
Voila! We’ve successfully implemented the backward pass.
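One detail worth pointing out: the _backward closures accumulate into .grad with += rather than assigning. This matters whenever a Value is used more than once in the graph, since each use must contribute to its gradient. A quick check with our class (a sketch, not part of the original walkthrough):

# A value that feeds into an expression twice receives both gradient contributions.
a = Value(3.0, label='a')
b = a + a   # db/da should be 2, not 1
b.backward()
print(a.grad)  # 2.0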
Activation Functions
Next, let’s add some activation functions to the mix. tanh
and relu
are two of the most popular activation functions. \[ \tanh x={\frac {e^{2x}-1}{e^{2x}+1}}.\]
\[ ReLU(x)={\begin{cases}x&{\text{if }}x>0,\\0&{\text{otherwise}}\end{cases}}\]
The derivative of \(\tanh\) turns out to be: \[ \frac{d\tanh}{dx} = 1 - \tanh^2(x) \] \(ReLU\)’s derivative is trivial to work out: 1 for \(x > 0\) and 0 otherwise.
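For completeness, one quick way to see where the tanh derivative comes from is to write \(\tanh\) as \(\sinh/\cosh\) and apply the quotient rule:
\[\frac{d}{dx}\tanh x = \frac{d}{dx}\frac{\sinh x}{\cosh x} = \frac{\cosh^2 x - \sinh^2 x}{\cosh^2 x} = 1 - \tanh^2 x\]
With these derivatives in hand, the two activations translate directly into Value methods: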
def tanh(self):
    x = self.data
    t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
    out = Value(t, (self, ), 'tanh')

    def _backward():
        self.grad += (1 - t**2) * out.grad
    out._backward = _backward
    return out

def relu(self):
    out = Value(self.data if self.data > 0 else 0, (self, ), 'ReLU')

    def _backward():
        self.grad += (out.data > 0) * out.grad
    out._backward = _backward
    return out
The AutoGrad Engine
Here’s what our Value
class now looks like, in its full glory. The Value
class is now a full-fledged autograd engine, ready to train neural networks (See part 2 for training neural nets).
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None
        self._prev = _children
        self.label = label
        self._op = _op

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad * 1.0
            other.grad += out.grad * 1.0
        out._backward = _backward
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other): # other * self
        return self * other

    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        return out

    def relu(self):
        out = Value(self.data if self.data > 0 else 0, (self, ), 'ReLU')

        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward
        return out

    def backward(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"
Example using a Neural Network
Let’s test our engine on a toy neural network.
# inputs x1,x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights w1,w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias of the neuron
b = Value(6.8813735870195432, label='b')
# x1*w1 + x2*w2 + b
x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
o = n.tanh(); o.label = 'o'
draw_dot(o)
o.backward()
draw_dot(o)
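As an optional cross-check (assuming PyTorch is installed; this is not part of littlegrad itself), the same neuron can be built with torch tensors and the gradients compared against what our engine computed:

import torch

x1 = torch.tensor(2.0, requires_grad=True)
x2 = torch.tensor(0.0, requires_grad=True)
w1 = torch.tensor(-3.0, requires_grad=True)
w2 = torch.tensor(1.0, requires_grad=True)
b = torch.tensor(6.8813735870195432, requires_grad=True)
n = x1*w1 + x2*w2 + b
o = torch.tanh(n)
o.backward()
# These should match (up to float precision) the grad fields drawn in the graph above.
print(x1.grad.item(), w1.grad.item(), x2.grad.item(), w2.grad.item())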