def do_adagrad(max_epochs):
    """AdaGrad on full-batch gradient descent.

    Accumulates squared gradients in v_w/v_b and divides the base
    learning rate by their square root, so parameters with large
    gradient history take smaller steps.

    NOTE(review): X, Y, grad_w and grad_b are assumed to be defined at
    module level — confirm against the surrounding script.
    """
    # Initialization
    w, b, eta = -2, -2, 0.1
    v_w, v_b, eps = 0, 0, 1e-8   # eps guards against division by zero
    for i in range(max_epochs):
        # zero the per-epoch gradient accumulators
        dw, db = 0, 0
        for x, y in zip(X, Y):
            # BUG FIX: accumulate gradients over the whole batch.
            # The source used `dw = grad_w(...)`, which silently kept
            # only the LAST sample's gradient and made the zeroing
            # above meaningless.
            dw += grad_w(w, b, x, y)
            db += grad_b(w, b, x, y)
        # grow the squared-gradient history (never decays)
        v_w = v_w + dw**2
        v_b = v_b + db**2
        # adaptive per-parameter update
        w = w - eta * dw / (np.sqrt(v_w) + eps)
        b = b - eta * db / (np.sqrt(v_b) + eps)
def do_adagrad(max_epochs):
    """AdaGrad optimizer, full-batch variant (duplicate slide copy).

    v_w/v_b hold the running sum of squared gradients; the effective
    step for each parameter is eta / (sqrt(history) + eps).

    NOTE(review): relies on module-level X, Y, grad_w, grad_b — verify
    they are defined in the enclosing script.
    """
    # Initialization
    w, b, eta = -2, -2, 0.1
    v_w, v_b, eps = 0, 0, 1e-8   # eps avoids division by zero
    for i in range(max_epochs):
        # zero grad accumulators for this epoch
        dw, db = 0, 0
        for x, y in zip(X, Y):
            # BUG FIX: was `dw = grad_w(...)` — that discarded every
            # sample's gradient except the last; accumulate instead.
            dw += grad_w(w, b, x, y)
            db += grad_b(w, b, x, y)
        # accumulate squared-gradient history
        v_w = v_w + dw**2
        v_b = v_b + db**2
        # update parameters with the adaptive step size
        w = w - eta * dw / (np.sqrt(v_w) + eps)
        b = b - eta * db / (np.sqrt(v_b) + eps)
def do_adagrad(max_epochs):
    """Full-batch gradient descent with the AdaGrad adaptive step size."""
    w, b, eta = -2, -2, 0.1        # initial parameters and base learning rate
    v_w, v_b, eps = 0, 0, 1e-8     # squared-gradient history + zero-div guard
    for epoch in range(max_epochs):
        dw, db = 0, 0              # reset gradient accumulators
        for x, y in zip(X, Y):
            # sum the gradients over every training example
            dw += grad_w(w, b, x, y)
            db += grad_b(w, b, x, y)
        # history only grows, so the effective LR shrinks monotonically
        v_w += dw**2
        v_b += db**2
        # scale the step by the inverse root of the history
        w -= (eta / (np.sqrt(v_w) + eps)) * dw
        b -= (eta / (np.sqrt(v_b) + eps)) * db
def do_rmsprop(max_epochs):
    """RMSProp on full-batch gradient descent.

    Like AdaGrad, but the squared-gradient history is an exponential
    moving average with decay `beta`, so the effective learning rate
    can recover instead of shrinking forever.

    NOTE(review): depends on module-level X, Y, grad_w, grad_b.
    """
    # Initialization
    w, b, eta = -4, 4, 0.1
    beta = 0.5                    # decay rate of the squared-gradient EMA
    v_w, v_b, eps = 0, 0, 1e-4
    for i in range(max_epochs):
        # zero the per-epoch gradient accumulators
        dw, db = 0, 0
        for x, y in zip(X, Y):
            # BUG FIX: accumulate over the batch; the source's
            # `dw = grad_w(...)` kept only the last sample's gradient.
            dw += grad_w(w, b, x, y)
            db += grad_b(w, b, x, y)
        # exponential moving average of squared gradients
        v_w = beta * v_w + (1 - beta) * dw**2
        v_b = beta * v_b + (1 - beta) * db**2
        # adaptive parameter update
        w = w - eta * dw / (np.sqrt(v_w) + eps)
        b = b - eta * db / (np.sqrt(v_b) + eps)
def do_adadelta(max_epochs):
    """AdaDelta: RMSProp-style scaling with no explicit learning rate.

    Maintains two EMAs — v_* over squared gradients and u_* over squared
    parameter updates — and steps by their RMS ratio.
    """
    w, b = -4, -4
    beta = 0.99                    # EMA decay shared by both accumulators
    v_w, v_b, eps = 0, 0, 1e-4
    u_w, u_b = 0, 0                # EMA of squared parameter updates
    for epoch in range(max_epochs):
        dw, db = 0, 0
        for x, y in zip(X, Y):
            # full-batch gradient accumulation
            dw += grad_w(w, b, x, y)
            db += grad_b(w, b, x, y)
        # EMA of squared gradients
        v_w = beta * v_w + (1 - beta) * dw**2
        v_b = beta * v_b + (1 - beta) * db**2
        # step size = RMS(previous updates) / RMS(gradients)
        delta_w = dw * np.sqrt(u_w + eps) / np.sqrt(v_w + eps)
        delta_b = db * np.sqrt(u_b + eps) / np.sqrt(v_b + eps)
        # fold the update just computed into the update-history EMA
        u_w = beta * u_w + (1 - beta) * delta_w**2
        u_b = beta * u_b + (1 - beta) * delta_b**2
        w = w - delta_w
        b = b - delta_b
Incorporating classical momentum
\(L^2\) norm
def do_adam_sgd(max_epochs):
    """Adam with stochastic (per-sample) updates.

    m_* is an EMA of gradients (first moment), v_* an EMA of squared
    gradients (second moment); both are bias-corrected before use.

    NOTE(review): the flat source lost its indentation.  The `_sgd`
    suffix and the non-accumulating `dw = grad_...` suggest one update
    per (x, y) sample, which is how it is reconstructed here; the
    bias-correction exponent i+1 (per epoch, not per step) is kept
    exactly as written — confirm against the original slides.
    """
    # Initialization
    w, b, eta = -4, -4, 0.1
    beta1, beta2 = 0.9, 0.999      # EMA decays for 1st/2nd moments
    m_w, m_b, v_w, v_b = 0, 0, 0, 0
    eps = 1e-10                    # hoisted: constant, was reset every epoch
    for i in range(max_epochs):
        for x, y in zip(X, Y):
            # per-sample gradients
            dw = grad_w_sgd(w, b, x, y)
            db = grad_b_sgd(w, b, x, y)
            # moment estimates
            m_w = beta1 * m_w + (1 - beta1) * dw
            m_b = beta1 * m_b + (1 - beta1) * db
            v_w = beta2 * v_w + (1 - beta2) * dw**2
            v_b = beta2 * v_b + (1 - beta2) * db**2
            # bias correction (counteracts zero initialization)
            m_w_hat = m_w / (1 - np.power(beta1, i + 1))
            m_b_hat = m_b / (1 - np.power(beta1, i + 1))
            v_w_hat = v_w / (1 - np.power(beta2, i + 1))
            v_b_hat = v_b / (1 - np.power(beta2, i + 1))
            # parameter update
            w = w - eta * m_w_hat / (np.sqrt(v_w_hat) + eps)
            b = b - eta * m_b_hat / (np.sqrt(v_b_hat) + eps)
Suppose \(\nabla w_0 = 0.1\).
Suppose \(\nabla w_0 = 0.1\).
Adam
Adamax
def do_adamax_gd(max_epochs):
    """Adamax on full-batch gradient descent.

    Adam variant where the second moment is an infinity-norm estimate,
    v = max(beta2 * v, |grad|), so only the first moment needs bias
    correction and no sqrt appears in the update.

    NOTE(review): relies on module-level X, Y, grad_w_sgd, grad_b_sgd.
    """
    # Initialization
    w, b, eta = -4, -4, 0.1
    beta1, beta2 = 0.9, 0.99
    m_w, m_b, v_w, v_b = 0, 0, 0, 0
    # (removed unused m_w_hat/... pre-initialization; they are assigned
    # fresh each epoch below.  eps hoisted out of the epoch loop.)
    eps = 1e-10
    for i in range(max_epochs):
        # zero the gradient accumulators
        dw, db = 0, 0
        for x, y in zip(X, Y):
            # accumulate full-batch gradients
            dw += grad_w_sgd(w, b, x, y)
            db += grad_b_sgd(w, b, x, y)
        # first moment (EMA of gradients)
        m_w = beta1 * m_w + (1 - beta1) * dw
        m_b = beta1 * m_b + (1 - beta1) * db
        # infinity-norm second moment: a decayed running max
        # (scalar builtins instead of np.max over a temporary list)
        v_w = max(beta2 * v_w, abs(dw))
        v_b = max(beta2 * v_b, abs(db))
        # bias-correct the first moment only
        m_w_hat = m_w / (1 - np.power(beta1, i + 1))
        m_b_hat = m_b / (1 - np.power(beta1, i + 1))
        # update parameters
        w = w - eta * m_w_hat / (v_w + eps)
        b = b - eta * m_b_hat / (v_b + eps)
def do_adamax_sgd(max_epochs):
    """Adamax optimizer (slide duplicate of do_adamax_gd).

    NOTE(review): despite the `_sgd` suffix the body accumulates the
    gradient over all of X, Y (`dw += ...`), so it is reconstructed
    with one update per epoch, matching that accumulation — confirm
    against the original slides.
    """
    w, b, eta = -4, -4, 0.1
    beta1, beta2 = 0.9, 0.99
    m_w, m_b, v_w, v_b = 0, 0, 0, 0
    m_w_hat, m_b_hat, v_w_hat, v_b_hat = 0, 0, 0, 0
    for i in range(max_epochs):
        dw, db = 0, 0
        eps = 1e-10
        for x, y in zip(X, Y):
            # sum gradients over the data set
            dw += grad_w_sgd(w, b, x, y)
            db += grad_b_sgd(w, b, x, y)
        # EMA first moment; running-max (infinity-norm) second moment
        m_w = beta1 * m_w + (1 - beta1) * dw
        m_b = beta1 * m_b + (1 - beta1) * db
        v_w = np.max([beta2 * v_w, np.abs(dw)])
        v_b = np.max([beta2 * v_b, np.abs(db)])
        # bias correction for the first moment
        m_w_hat = m_w / (1 - np.power(beta1, i + 1))
        m_b_hat = m_b / (1 - np.power(beta1, i + 1))
        # parameter update (no sqrt needed with the max-norm moment)
        w = w - eta * m_w_hat / (v_w + eps)
        b = b - eta * m_b_hat / (v_b + eps)
def do_adamax_sgd(max_epochs):
    """Nesterov-flavoured Adam update (matches the NAdam formulation).

    The step mixes the bias-corrected momentum with the current
    bias-corrected gradient: beta1*m_hat + (1-beta1)*dw/(1-beta1^t).
    NOTE(review): the source names this `do_adamax_sgd`, apparently a
    slide mislabel — the name is kept for interface compatibility.
    """
    # Initialization
    w, b, eta = -4, -4, 0.1
    beta1, beta2 = 0.9, 0.99
    # BUG FIX: the source initialised only the *_hat values, so the
    # first read of m_w / v_w below raised NameError.
    m_w, m_b, v_w, v_b = 0, 0, 0, 0
    m_w_hat, m_b_hat, v_w_hat, v_b_hat = 0, 0, 0, 0
    eps = 1e-10
    for i in range(max_epochs):
        dw, db = 0, 0
        for x, y in zip(X, Y):
            # accumulate gradients over the batch
            dw += grad_w_sgd(w, b, x, y)
            db += grad_b_sgd(w, b, x, y)
        # first and second moment EMAs
        m_w = beta1 * m_w + (1 - beta1) * dw
        m_b = beta1 * m_b + (1 - beta1) * db
        v_w = beta2 * v_w + (1 - beta2) * dw**2
        v_b = beta2 * v_b + (1 - beta2) * db**2
        # bias corrections
        m_w_hat = m_w / (1 - beta1**(i + 1))
        m_b_hat = m_b / (1 - beta1**(i + 1))
        v_w_hat = v_w / (1 - beta2**(i + 1))
        v_b_hat = v_b / (1 - beta2**(i + 1))
        # look-ahead update along the momentum direction
        # (source's broken `\\` line continuations repaired)
        w = w - (eta / np.sqrt(v_w_hat + eps)) * (
            beta1 * m_w_hat + (1 - beta1) * dw / (1 - beta1**(i + 1)))
        b = b - (eta / np.sqrt(v_b_hat + eps)) * (
            beta1 * m_b_hat + (1 - beta1) * db / (1 - beta1**(i + 1)))
— Pedro Domingos
def cyclic_lr(iteration, max_lr, base_lr, step_size):
    """Triangular cyclical learning rate.

    Rises linearly from base_lr to max_lr over step_size iterations,
    then falls back to base_lr, repeating every 2*step_size iterations.
    """
    # which triangle (cycle) this iteration falls in, counted from 1
    cycle = np.floor(1 + iteration / (2 * step_size))
    # x sweeps 1 -> 0 -> 1 across a single cycle
    x = np.abs(iteration / step_size - 2 * cycle + 1)
    return base_lr + (max_lr - base_lr) * np.maximum(0, 1 - x)
def do_gradient_descent_clr(max_epochs):
    """Vanilla gradient descent driven by a triangular cyclical LR.

    NOTE(review): grad_w/grad_b are called here with (w, b) only —
    i.e. full-batch gradient helpers, unlike the per-sample versions
    used elsewhere in this file; confirm they exist in this form.
    """
    w, b = -2, 0.0001
    for i in range(max_epochs):
        dw = grad_w(w, b)
        db = grad_b(w, b)
        # compute the LR once per iteration (the source called
        # cyclic_lr twice with identical arguments)
        lr = cyclic_lr(i, max_lr=0.1, base_lr=0.001, step_size=30)
        w = w - lr * dw
        b = b - lr * db
\(\eta_t=\eta_{max}\), for \(t=0\)
\(\eta_t=\eta_{min}\), for \(t=T\)
warmupSteps=4000