Andrej Karpathy-MicroGrad


A 2.5 hr video walkthrough of micrograd. I wish I could have watched this video 5 yrs earlier! It clears up so many questions about loss.backward()!

1 Some Python notes

I actually learned quite a bit about Python operations here :)

def __repr__(self):
  # returns the string shown when the object is printed

def __add__(self, other):
  # a+b == a.__add__(b)
  # make sure a + 1 works
  other = other if isinstance(other, Value) else Value(other)
# Make sure 1 + a works
def __radd__(self, other): # other + self
  return self + other 

def __mul__(self, other):
  # make sure a * 2 works
  other = other if isinstance(other, Value) else Value(other)
  # a*b == a.__mul__(b)
# Make sure 2 * a works
def __rmul__(self, other): # other * self
  return self * other 

# Define division
def __truediv__(self, other): #self / other
  return self * other ** -1

# Define negation and subtraction
def __neg__(self): #-self
  return self * -1
def __sub__(self, other): # self - other
  return self + (-other)
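
A quick sanity check of the overloads above (a sketch, assuming the micrograd-style Value class that wraps a float in .data):

a = Value(2.0)
b = Value(3.0)
print(a + 1)    # __add__ coerces 1 into Value(1)
print(1 + a)    # __radd__ falls back to a + 1
print(2 * b)    # __rmul__ falls back to b * 2
print(a / b)    # __truediv__ rewrites this as a * b**-1
print(a - b)    # __sub__ rewrites this as a + (-b)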

2 Gradients

The fundamental idea is to apply the chain rule. Here are some examples of finding the gradients of the exp and power functions:

def exp(self):
  x = self.data
  out = Value(math.exp(x), (self, ), 'exp')
  def _backward():
    self.grad += out.data * out.grad
  out._backward = _backward
  return out

def __pow__(self, other):
  assert isinstance(other, (int, float)), "only supporting int/float powers for now"
  out = Value(self.data**other, (self,), f'**{other}')
  def _backward():
      self.grad += other * (self.data ** (other - 1)) * out.grad
  out._backward = _backward

  return out
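
A quick numeric sanity check of the __pow__ rule above (a sketch, assuming the Value class stores .data and .grad as in micrograd):

x = Value(3.0)
y = x ** 2
y.grad = 1.0
y._backward()          # analytic: dy/dx = 2 * x = 6.0

h = 1e-6
numeric = ((3.0 + h) ** 2 - 3.0 ** 2) / h   # finite-difference estimate
print(x.grad, numeric)                       # both close to 6.0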

3 Backpropagation

  • Define _backward for each operation, to get the grads for self and/or other
def __add__(self, other):
  ...
  def _backward():
    self.grad += 1.0 * out.grad
    other.grad += 1.0 * out.grad
  out._backward = _backward
  
  return out

def __mul__(self, other):
  ...    
  def _backward():
    self.grad += other.data * out.grad
    other.grad += self.data * out.grad
  out._backward = _backward
    
  return out
  • Accumulate grads
    use += instead of =, so a node that is used more than once accumulates gradients from all of its uses
  • Topological sort
    topo = []
    visited = set()
    def build_topo(v):
      if v not in visited:
        visited.add(v)
        for child in v._prev:
          build_topo(child)
        topo.append(v)
    build_topo(self)
    
  • Traverse the reversed list (a tiny end-to-end example follows this list)
    def backward(self):
      # Always starts with 1.0
      self.grad = 1.0
      for node in reversed(topo):
        node._backward()
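
A tiny end-to-end example tying the pieces together (a sketch, assuming the full Value class with the _backward closures and the backward() method above):

a = Value(2.0)
b = Value(-3.0)
c = a * b + a          # c = -6 + 2 = -4
c.backward()

print(a.grad)   # dc/da = b + 1 = -2.0  (a is used twice, hence the += accumulation)
print(b.grad)   # dc/db = a     =  2.0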
    

4 PyTorch implementation

# Python floats are double precision by default; PyTorch tensors default
# to float32, so cast to double to match micrograd
x1 = torch.Tensor([2.0]).double()
# Force autograd to track gradients for this leaf tensor
x1.requires_grad = True

# Get the scalar value by calling .item()
# (x1.grad is only populated after calling backward() on the output)
print('x1', x1.grad.item())
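
For a fuller picture, here is a sketch of the same kind of check in PyTorch; the numbers follow the single tanh neuron example from the video as I recall it, so treat the exact values as an assumption:

import torch

x1 = torch.Tensor([2.0]).double();  x1.requires_grad = True
x2 = torch.Tensor([0.0]).double();  x2.requires_grad = True
w1 = torch.Tensor([-3.0]).double(); w1.requires_grad = True
w2 = torch.Tensor([1.0]).double();  w2.requires_grad = True
b = torch.Tensor([6.8813735870195432]).double(); b.requires_grad = True

# forward pass of a single tanh neuron
n = x1*w1 + x2*w2 + b
o = torch.tanh(n)

o.backward()
print('o', o.data.item())       # about 0.7071 for this choice of b
print('x1', x1.grad.item())     # about -1.5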

5 NN implementation

  • Neuron
    def __call__(self, x):
      # w * x + b
      # sum() needs () around a generator expression
      # sum() takes a 2nd argument as the starting value, here the bias
      act = sum((wi*xi for wi, xi in zip(self.w, x)), self.b)
      out = act.tanh()
      return out
    
  • Layer
    class Layer:

      def __init__(self, nin, nout):
        self.neurons = [Neuron(nin) for _ in range(nout)]

      def __call__(self, x):
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else outs
    
  • MLP
class MLP:
  
  def __init__(self, nin, nouts):
    # nin is the input dimension (int)
    # nouts is a list of layer sizes
    sz = [nin] + nouts
    self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]
  
  def __call__(self, x):
    for layer in self.layers:
      x = layer(x)
    return x
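
A quick usage sketch (the layer sizes and input are just an illustration, and the Neuron class is assumed to initialize random weights as in micrograd):

x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1])   # 3 inputs, two hidden layers of 4, one output
print(n(x))             # a single Value, since the last layer has one neuron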

6 Gradient Descent Updates

  • find all the parameters from the neurons, layers, and the MLP
    # Neuron: only w and b
    def parameters(self):
      return self.w + [self.b]
    # Layer: two loops in one list comprehension
    def parameters(self):
      return [p for neuron in self.neurons for p in neuron.parameters()]
    # MLP: same pattern, one level up
    def parameters(self):
      return [p for layer in self.layers for p in layer.parameters()]
    
  • Find the loss and backprop to get all the grads
# Get the predictions
ypred = [n(x) for x in xs]
# Squared error loss (summed over the examples)
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))

loss.backward()
  • GD updates
    # Just update the data by NEGATIVE learning rate x grad
    for p in n.parameters():
      p.data += -0.1 * p.grad
    
  • A common BUG! Forgetting to reset the grads to zero before calling backward(), so gradients from earlier steps accumulate (a full training-loop sketch follows at the end of this section).


# The micrograd equivalent of PyTorch's zero_grad(): reset grads before each backward pass
for p in n.parameters():
  p.grad = 0.0
loss.backward()
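
Putting it all together, a minimal training-loop sketch built on the MLP above (the toy dataset, learning rate, and step count are assumptions for illustration):

xs = [[2.0, 3.0, -1.0], [3.0, -1.0, 0.5], [0.5, 1.0, 1.0], [1.0, 1.0, -1.0]]
ys = [1.0, -1.0, -1.0, 1.0]   # desired targets
n = MLP(3, [4, 4, 1])

for step in range(20):
  # forward pass and squared-error loss
  ypred = [n(x) for x in xs]
  loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))

  # zero the grads BEFORE backward (the bug above)
  for p in n.parameters():
    p.grad = 0.0
  loss.backward()

  # gradient descent update
  for p in n.parameters():
    p.data += -0.05 * p.grad

  print(step, loss.data)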