# Adam (simplified update, without bias correction)
m = beta1*m + (1-beta1)*dx # update first moment (running mean of gradients)
v = beta2*v + (1-beta2)*(dx**2) # update second moment (running mean of squared gradients)
x += -learning_rate * m / (np.sqrt(v) + 1e-7) # step with the smoothed gradient, scaled per parameter
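Because m and v are initialized at zero, their values in the first few iterations are biased toward zero and underestimate the true gradient statistics. The short sketch below is a minimal illustration (it assumes a constant gradient of 1.0 and beta1 = 0.9, values chosen only for this example): the raw first moment m only slowly approaches the true mean, while the bias-corrected estimate recovers it immediately. The full update that follows applies exactly this correction.

beta1 = 0.9
m = 0.0
for t in range(1, 6):
    dx = 1.0                   # assume a constant gradient of 1.0, purely for illustration
    m = beta1 * m + (1 - beta1) * dx
    mt = m / (1 - beta1 ** t)  # bias-corrected estimate
    print(t, round(m, 3), round(mt, 3))
# m starts at 0.1 and only slowly approaches the true mean of 1.0,
# while the bias-corrected mt recovers 1.0 from the first step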
# Adam (full update, with bias correction)
m, v = np.zeros_like(x), np.zeros_like(x) # initialize first and second moment estimates to zeros
for t in range(1, big_number + 1): # t starts at 1 so the bias-correction denominators are nonzero
    dx = # ... evaluate gradient
    m = beta1 * m + (1-beta1) * dx
    v = beta2 * v + (1-beta2) * (dx**2)
    mt = m / (1-beta1**t) # bias-corrected first moment; keep m itself unmodified
    vt = v / (1-beta2**t) # bias-corrected second moment
    x += -learning_rate * mt / (np.sqrt(vt) + 1e-7)
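To make the loop concrete, here is a self-contained sketch of the same update applied to a toy quadratic; the objective, starting point, iteration count, and hyperparameter values (learning_rate = 0.1, beta1 = 0.9, beta2 = 0.999) are illustrative assumptions rather than anything prescribed above.

import numpy as np

# toy objective f(x) = 0.5 * ||x||^2, whose gradient at x is simply x
x = np.array([3.0, -2.0])
learning_rate, beta1, beta2, eps = 0.1, 0.9, 0.999, 1e-7
m, v = np.zeros_like(x), np.zeros_like(x)

for t in range(1, 501):
    dx = x.copy()                            # gradient of the toy objective
    m = beta1 * m + (1 - beta1) * dx         # first moment (mean of gradients)
    v = beta2 * v + (1 - beta2) * (dx ** 2)  # second moment (mean of squared gradients)
    mt = m / (1 - beta1 ** t)                # bias-corrected estimates
    vt = v / (1 - beta2 ** t)
    x += -learning_rate * mt / (np.sqrt(vt) + eps)

print(x)  # both coordinates should end up near the minimum at [0, 0]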
# Adagrad update
cache += dx**2 # accumulate squared gradients per parameter; the cache only grows
x += -learning_rate * dx / (np.sqrt(cache) + 1e-7) # per-parameter step, normalized by the root of the accumulated squared gradients
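Because cache accumulates squared gradients and never decays, the effective step size can only shrink as training goes on. The minimal sketch below (a scalar parameter and a constant gradient of 1.0, chosen purely for illustration) makes this visible; the RMSProp update that follows avoids it by letting the cache leak.

import numpy as np

learning_rate, eps = 0.1, 1e-7
x, cache = 0.0, 0.0

for t in range(1, 501):
    dx = 1.0                       # constant gradient, purely for illustration
    cache += dx ** 2               # the cache grows without bound
    step = -learning_rate * dx / (np.sqrt(cache) + eps)
    x += step
    if t in (1, 10, 100, 500):
        print(t, step)
# the step keeps shrinking toward zero (roughly -learning_rate / sqrt(t)),
# so updates with a monotonically growing cache eventually become very small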
# RMSProp
cache = decay_rate * cache + (1 - decay_rate) * (dx**2) # leaky running average of squared gradients
x += -learning_rate * dx / (np.sqrt(cache) + 1e-7)
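For contrast with the Adagrad sketch above, here is the same toy setup with a leaky cache (decay_rate = 0.99, again an illustrative choice): the effective step settles near -learning_rate instead of decaying toward zero.

import numpy as np

learning_rate, decay_rate, eps = 0.1, 0.99, 1e-7
x, cache = 0.0, 0.0

for t in range(1, 501):
    dx = 1.0                       # constant gradient, purely for illustration
    cache = decay_rate * cache + (1 - decay_rate) * dx ** 2
    step = -learning_rate * dx / (np.sqrt(cache) + eps)
    x += step
    if t in (1, 10, 100, 500):
        print(t, step)
# the cache approaches dx**2 rather than growing without bound, so the step
# settles near -learning_rate = -0.1 instead of shrinking toward zero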