w | b | Loss |
---|---|---|
0.50 | 0.00 | 0.0730 |
-0.10 | 0.00 | 0.14 |
0.94 | -0.94 | 0.0214 |
1.42 | -1.73 | 0.0028 |
1.65 | -2.08 | 0.0003 |
1.78 | -2.27 | 0.0000 |
[\( \eta \) is typically small, so \( \eta^2, \eta^3, \cdots \rightarrow 0 \)]
where \( \nabla w_t = \frac{\partial \mathscr{L}(w,b)}{\partial w} \Big|_{w=w_t,\, b=b_t} \)
and \( \nabla b_t = \frac{\partial \mathscr{L}(w,b)}{\partial b} \Big|_{w=w_t,\, b=b_t} \).
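A quick way to sanity-check these partial derivatives is to compare them against a finite-difference approximation. The snippet below is a minimal sketch; the model f, the data point and the helper numerical_grad_w are illustrative, mirroring the sigmoid neuron and squared-error loss used in the code later in this section.

import numpy as np

def f(x, w, b):
    # single-input sigmoid neuron, as used throughout this section
    return 1.0 / (1.0 + np.exp(-(w*x + b)))

def loss(w, b, x=0.5, y=0.2):
    return 0.5 * (f(x, w, b) - y)**2

def numerical_grad_w(w, b, h=1e-6):
    # central-difference approximation of dL/dw at (w, b)
    return (loss(w + h, b) - loss(w - h, b)) / (2*h)

w_t, b_t = 0.5, 0.0
fx = f(0.5, w_t, b_t)
analytic = (fx - 0.2) * fx * (1 - fx) * 0.5      # (f(x)-y) * f(x) * (1-f(x)) * x
print(analytic, numerical_grad_w(w_t, b_t))      # the two values should agree closely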
max_iterations = 1000
w, b = random(), random()            # random initialisation
while max_iterations:
    # dw, db are the gradients of the loss w.r.t. w and b at the current (w, b)
    w = w - eta*dw
    b = b - eta*db
    max_iterations -= 1
import numpy as np

X = [0.5, 2.5]
Y = [0.2, 0.9]

def f(x, w, b):                        # sigmoid with parameters w, b
    return 1 / (1 + np.exp(-(w*x + b)))

def error(w, b):                       # squared-error loss over the data
    err = 0.0
    for x, y in zip(X, Y):
        fx = f(x, w, b)
        err += (fx - y)**2
    return 0.5 * err

def grad_b(x, w, b, y):                # dL/db for a single data point
    fx = f(x, w, b)
    return (fx - y) * fx * (1 - fx)

def grad_w(x, w, b, y):                # dL/dw for a single data point
    fx = f(x, w, b)
    return (fx - y) * fx * (1 - fx) * x

def do_gradient_descent():
    w, b, eta, max_epochs = -2, -2, 1.0, 1000
    for i in range(max_epochs):
        dw, db = 0, 0
        for x, y in zip(X, Y):         # accumulate gradients over all points
            dw += grad_w(x, w, b, y)
            db += grad_b(x, w, b, y)
        w = w - eta*dw
        b = b - eta*db
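To reproduce trajectory tables such as the one at the top of this section, the same loop can record \( (w, b, \mathscr{L}) \) after every epoch. A minimal sketch, reusing the error, grad_w and grad_b helpers above; the function name do_gradient_descent_with_trace is illustrative.

def do_gradient_descent_with_trace(max_epochs=1000, eta=1.0):
    w, b = -2, -2
    trace = [(w, b, error(w, b))]      # (w, b, loss) after each epoch
    for i in range(max_epochs):
        dw, db = 0, 0
        for x, y in zip(X, Y):
            dw += grad_w(x, w, b, y)
            db += grad_b(x, w, b, y)
        w, b = w - eta*dw, b - eta*db
        trace.append((w, b, error(w, b)))
    return trace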
def do_gradient_descent():
    # same routine as above, started from a different initialisation
    w, b, eta, max_epochs = -4, -4, 1.0, 1000
    for i in range(max_epochs):
        dw, db = 0, 0
        for x, y in zip(X, Y):
            dw += grad_w(x, w, b, y)
            db += grad_b(x, w, b, y)
        w = w - eta*dw
        b = b - eta*db
\( u_t = \beta u_{t-1} + \eta \nabla w_t \)
\( w_{t+1} = w_t - u_t \)
with \( u_{-1}=0 \), \( w_0 = \mathrm{rand}() \) and \( 0 \leq \beta < 1 \)
def do_mgd(max_epochs):
    w, b, eta = -2, -2, 1.0
    prev_uw, prev_ub, beta = 0, 0, 0.9
    for i in range(max_epochs):
        dw, db = 0, 0
        for x, y in zip(X, Y):
            dw += grad_w(x, w, b, y)
            db += grad_b(x, w, b, y)
        uw = beta*prev_uw + eta*dw     # exponentially weighted history of updates
        ub = beta*prev_ub + eta*db
        w = w - uw
        b = b - ub
        prev_uw = uw
        prev_ub = ub
\(w_0=-2, b_0=-4, \eta=1 \)
\( w_{lookahead} = w_t - \beta u_{t-1} \)
\( u_t = \beta u_{t-1} + \eta \nabla w_{lookahead} \)
\( w_{t+1} = w_t - u_t \)
with \( u_{-1}=0 \) and \( 0 \leq \beta < 1 \)
def do_nag(max_epochs):
    w, b, eta = -2, -2, 1.0
    prev_vw, prev_vb, beta = 0, 0, 0.9
    for i in range(max_epochs):
        dw, db = 0, 0
        # do partial updates (look-ahead step)
        v_w = beta*prev_vw
        v_b = beta*prev_vb
        for x, y in zip(X, Y):
            # gradients at the look-ahead point (w - v_w, b - v_b)
            dw += grad_w(x, w - v_w, b - v_b, y)
            db += grad_b(x, w - v_w, b - v_b, y)
        vw = beta*prev_vw + eta*dw
        vb = beta*prev_vb + eta*db
        w = w - vw
        b = b - vb
        prev_vw = vw
        prev_vb = vb
def do_gradient_descent():
    w, b, eta, max_epochs = -2, -2, 1.0, 1000
    for i in range(max_epochs):
        dw, db = 0, 0
        for x, y in zip(X, Y):
            dw += grad_w(x, w, b, y)
            db += grad_b(x, w, b, y)
        w = w - eta*dw                 # one update per epoch (full batch)
        b = b - eta*db

def do_stochastic_gradient_descent():
    w, b, eta, max_epochs = -2, -2, 1.0, 1000
    for i in range(max_epochs):
        for x, y in zip(X, Y):
            dw = grad_w(x, w, b, y)
            db = grad_b(x, w, b, y)
            w = w - eta*dw             # one update per data point
            b = b - eta*db
def do_stochastic_gradient_descent():
    w, b, eta, max_epochs = -2, -2, 1.0, 500
    for i in range(max_epochs):
        for x, y in zip(X, Y):
            dw = grad_w(x, w, b, y)
            db = grad_b(x, w, b, y)
            w = w - eta*dw             # update after every data point
            b = b - eta*db

def do_minibatch_stochastic_gradient_descent():
    w, b, eta, max_epochs = -2, -2, 1.0, 500
    mini_batch_size = 25
    for i in range(max_epochs):
        dw, db, num_points_seen = 0, 0, 0
        for x, y in zip(X, Y):
            dw += grad_w(x, w, b, y)
            db += grad_b(x, w, b, y)
            num_points_seen += 1
            if num_points_seen % mini_batch_size == 0:
                w = w - eta*dw         # update after every mini_batch_size points
                b = b - eta*db
                dw, db = 0, 0
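With a larger dataset one would normally shuffle the data every epoch and slice it into mini-batches instead of counting points one by one. The sketch below assumes X and Y are numpy arrays and reuses the per-point grad_w/grad_b helpers; the function name do_minibatch_sgd_shuffled is illustrative.

import numpy as np

def do_minibatch_sgd_shuffled(X, Y, eta=1.0, max_epochs=500, batch_size=25):
    w, b = -2, -2
    n = len(X)
    for epoch in range(max_epochs):
        perm = np.random.permutation(n)            # new data order every epoch
        for start in range(0, n, batch_size):
            idx = perm[start:start + batch_size]
            dw = sum(grad_w(x, w, b, y) for x, y in zip(X[idx], Y[idx]))
            db = sum(grad_b(x, w, b, y) for x, y in zip(X[idx], Y[idx]))
            w = w - eta*dw                         # one update per mini-batch
            b = b - eta*db
    return w, b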
Algorithm | # of steps in 1 epoch |
---|---|
Vanilla (Batch) Gradient Descent | 1 (one update over the full data) |
Stochastic Gradient Descent | N (one update per data point) |
Mini-Batch Gradient Descent | N/B (one update per mini-batch of size B) |
\(\beta =0.5\)
\(\beta=0.85\)
\( \beta_t = \min\!\left(1-2^{-1-\log_2\left(\lfloor t/250 \rfloor+1\right)},\ \beta_{max}\right) \)
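Translated directly into code, this schedule looks as follows (a sketch; \( \beta_{max} = 0.999 \) is assumed here only for illustration):

import numpy as np

def beta_schedule(t, beta_max=0.999):
    # beta_t = min(1 - 2^(-1 - log2(floor(t/250) + 1)), beta_max)
    exponent = -1 - np.log2(np.floor(t / 250) + 1)
    return min(1 - 2**exponent, beta_max)

# beta starts at 0.5 and increases towards beta_max as t grows
print([round(beta_schedule(t), 3) for t in [0, 250, 500, 1000, 5000]])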
def do_line_search_gradient_descent(max_epochs):
    w, b, etas = -2, -2, [0.1, 0.5, 1, 2, 10]
    for i in range(max_epochs):
        dw, db = 0, 0
        for x, y in zip(X, Y):
            dw += grad_w(x, w, b, y)
            db += grad_b(x, w, b, y)
        min_error = 10000              # some large value
        for eta in etas:               # try each step size and keep the best one
            temp_w = w - eta * dw
            temp_b = b - eta * db
            if error(temp_w, temp_b) < min_error:
                best_w = temp_w
                best_b = temp_b
                min_error = error(best_w, best_b)
        w = best_w
        b = best_b
def do_adagrad(max_epochs):
    # initialization
    w, b, eta = -2, -2, 0.1
    v_w, v_b, eps = 0, 0, 1e-8
    for i in range(max_epochs):
        # zero the gradients
        dw, db = 0, 0
        for x, y in zip(X, Y):
            # accumulate the gradients over the data
            dw += grad_w(x, w, b, y)
            db += grad_b(x, w, b, y)
        # accumulate the squared gradient history
        v_w = v_w + dw**2
        v_b = v_b + db**2
        # update parameters with a per-parameter effective learning rate
        w = w - eta*dw/(np.sqrt(v_w) + eps)
        b = b - eta*db/(np.sqrt(v_b) + eps)
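Because \( v_w \) only ever grows, AdaGrad's effective step size \( \eta/\sqrt{v_w} \) can only shrink. A tiny illustration, assuming a constant gradient of 1 so the decay is easy to see:

import numpy as np

eta, eps, v = 0.1, 1e-8, 0.0
for t in range(1, 6):
    g = 1.0                               # pretend the gradient is constant
    v += g**2                             # accumulated squared gradients: v = t
    print(t, eta / (np.sqrt(v) + eps))    # effective step size ~ eta / sqrt(t)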
AdaGrad - SGD
def do_rmsprop(max_epochs):
    # initialization
    w, b, eta = -4, 4, 0.1
    beta = 0.5
    v_w, v_b, eps = 0, 0, 1e-4
    for i in range(max_epochs):
        # zero the gradients
        dw, db = 0, 0
        for x, y in zip(X, Y):
            # accumulate the gradients over the data
            dw += grad_w(x, w, b, y)
            db += grad_b(x, w, b, y)
        # exponentially weighted average of squared gradients
        v_w = beta*v_w + (1 - beta)*dw**2
        v_b = beta*v_b + (1 - beta)*db**2
        # update parameters
        w = w - eta*dw/(np.sqrt(v_w) + eps)
        b = b - eta*db/(np.sqrt(v_b) + eps)
[Plot: RMSProp vs. AdaGrad]
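The key difference from AdaGrad is that the exponentially weighted average keeps \( v \) bounded rather than letting it grow without limit, so the effective learning rate does not vanish. A small comparison, again assuming a constant gradient of 1:

import numpy as np

beta, v_adagrad, v_rmsprop = 0.5, 0.0, 0.0
for t in range(1, 6):
    g = 1.0
    v_adagrad += g**2                              # grows like t
    v_rmsprop = beta*v_rmsprop + (1 - beta)*g**2   # converges to g**2
    print(t, np.sqrt(v_adagrad), np.sqrt(v_rmsprop))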
def do_adadelta(max_epochs):
    # initialization
    w, b = -4, -4
    beta = 0.99
    v_w, v_b, eps = 0, 0, 1e-4
    u_w, u_b = 0, 0
    for i in range(max_epochs):
        dw, db = 0, 0
        for x, y in zip(X, Y):
            # accumulate the gradients over the data
            dw += grad_w(x, w, b, y)
            db += grad_b(x, w, b, y)
        # running average of squared gradients
        v_w = beta*v_w + (1 - beta)*dw**2
        v_b = beta*v_b + (1 - beta)*db**2
        # adaptive step: ratio of past update magnitude to gradient magnitude
        delta_w = dw*np.sqrt(u_w + eps)/(np.sqrt(v_w + eps))
        delta_b = db*np.sqrt(u_b + eps)/(np.sqrt(v_b + eps))
        # running average of squared updates
        u_w = beta*u_w + (1 - beta)*delta_w**2
        u_b = beta*u_b + (1 - beta)*delta_b**2
        w = w - delta_w
        b = b - delta_b
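For reference, the recurrences implemented above can be written compactly as follows (same notation as earlier, with \( \beta \) the decay rate and \( \epsilon \) a small constant for numerical stability; note that no learning rate \( \eta \) appears):

\[
\begin{aligned}
v_t &= \beta v_{t-1} + (1-\beta)\,(\nabla w_t)^2 \\
\Delta w_t &= \frac{\sqrt{u_{t-1} + \epsilon}}{\sqrt{v_t + \epsilon}}\,\nabla w_t \\
u_t &= \beta u_{t-1} + (1-\beta)\,(\Delta w_t)^2 \\
w_{t+1} &= w_t - \Delta w_t
\end{aligned}
\]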
Adam incorporates classical momentum (a first-moment estimate of the gradients) and scales the learning rate using the \(L_2\) norm of past gradients (a second-moment estimate).
def do_adam_sgd(max_epochs):
    # initialization
    w, b, eta = -4, -4, 0.1
    beta1, beta2 = 0.5, 0.5
    m_w, m_b, v_w, v_b = 0, 0, 0, 0
    eps = 1e-10
    for i in range(max_epochs):
        for x, y in zip(X, Y):
            # per-point gradients (stochastic update), using the grad_w/grad_b helpers defined earlier
            dw = grad_w(x, w, b, y)
            db = grad_b(x, w, b, y)
            # first and second moment estimates
            m_w = beta1*m_w + (1 - beta1)*dw
            m_b = beta1*m_b + (1 - beta1)*db
            v_w = beta2*v_w + (1 - beta2)*dw**2
            v_b = beta2*v_b + (1 - beta2)*db**2
            # bias correction
            m_w_hat = m_w/(1 - np.power(beta1, i + 1))
            m_b_hat = m_b/(1 - np.power(beta1, i + 1))
            v_w_hat = v_w/(1 - np.power(beta2, i + 1))
            v_b_hat = v_b/(1 - np.power(beta2, i + 1))
            # update parameters
            w = w - eta*m_w_hat/(np.sqrt(v_w_hat) + eps)
            b = b - eta*m_b_hat/(np.sqrt(v_b_hat) + eps)
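The bias correction matters mostly during the first few updates, when the moment estimates are still close to their zero initialisation. A quick check, assuming a constant gradient of 1 and \( \beta_1 = 0.9 \):

beta1, m = 0.9, 0.0
for t in range(1, 6):
    g = 1.0
    m = beta1*m + (1 - beta1)*g
    m_hat = m / (1 - beta1**t)                 # bias-corrected estimate
    print(t, round(m, 3), round(m_hat, 3))     # m starts near 0, m_hat is already ~1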
AdaMax: the \(L_2\) norm in Adam's second-moment estimate is replaced by the \(L_\infty\) (max) norm.
Doing the same modification for RMSProp gives MaxaProp.
def do_adamax_sgd(max_epochs):
    # initialization
    w, b, eta = -4, -4, 0.1
    beta1, beta2 = 0.9, 0.99
    m_w, m_b, v_w, v_b = 0, 0, 0, 0
    eps = 1e-10
    for i in range(max_epochs):
        dw, db = 0, 0
        for x, y in zip(X, Y):
            # accumulate the gradients (using the grad_w/grad_b helpers defined earlier)
            dw += grad_w(x, w, b, y)
            db += grad_b(x, w, b, y)
        # first moment estimate, as in Adam
        m_w = beta1*m_w + (1 - beta1)*dw
        m_b = beta1*m_b + (1 - beta1)*db
        # L-infinity (max) norm replaces the L2-based second moment
        v_w = np.max([beta2*v_w, np.abs(dw)])
        v_b = np.max([beta2*v_b, np.abs(db)])
        # bias correction is needed only for the first moment
        m_w_hat = m_w/(1 - np.power(beta1, i + 1))
        m_b_hat = m_b/(1 - np.power(beta1, i + 1))
        # update parameters
        w = w - eta*m_w_hat/(v_w + eps)
        b = b - eta*m_b_hat/(v_b + eps)
def do_nadam_sgd(max_epochs):
    # NAdam: Nesterov-accelerated Adam
    # initialization
    w, b, eta = -4, -4, 0.1
    beta1, beta2 = 0.9, 0.99
    m_w, m_b, v_w, v_b = 0, 0, 0, 0
    eps = 1e-10
    for i in range(max_epochs):
        dw, db = 0, 0
        for x, y in zip(X, Y):
            # accumulate the gradients (using the grad_w/grad_b helpers defined earlier)
            dw += grad_w(x, w, b, y)
            db += grad_b(x, w, b, y)
        # Adam-style moment estimates
        m_w = beta1*m_w + (1 - beta1)*dw
        m_b = beta1*m_b + (1 - beta1)*db
        v_w = beta2*v_w + (1 - beta2)*dw**2
        v_b = beta2*v_b + (1 - beta2)*db**2
        # bias correction
        m_w_hat = m_w/(1 - beta1**(i + 1))
        m_b_hat = m_b/(1 - beta1**(i + 1))
        v_w_hat = v_w/(1 - beta2**(i + 1))
        v_b_hat = v_b/(1 - beta2**(i + 1))
        # Nesterov-style look-ahead mixed into the Adam update
        w = w - (eta/np.sqrt(v_w_hat + eps)) * \
            (beta1*m_w_hat + (1 - beta1)*dw/(1 - beta1**(i + 1)))
        b = b - (eta/np.sqrt(v_b_hat + eps)) * \
            (beta1*m_b_hat + (1 - beta1)*db/(1 - beta1**(i + 1)))
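Written out, the update implemented above is (with \( \hat{m}_t \) and \( \hat{v}_t \) the bias-corrected first and second moments):

\[
w_{t+1} = w_t - \frac{\eta}{\sqrt{\hat{v}_t + \epsilon}} \left( \beta_1 \hat{m}_t + \frac{(1-\beta_1)\,\nabla w_t}{1-\beta_1^{t+1}} \right)
\]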
-Pedro Domingos
def cyclic_lr(iteration, max_lr, base_lr, step_size):
    # triangular schedule: lr ramps linearly from base_lr to max_lr and back,
    # completing one full cycle every 2*step_size iterations
    cycle = np.floor(1 + iteration/(2*step_size))
    x = np.abs(iteration/step_size - 2*cycle + 1)
    lr = base_lr + (max_lr - base_lr)*np.maximum(0, (1 - x))
    return lr

def do_gradient_descent_clr(max_epochs):
    w, b = -2, 0
    w_trace, b_trace = [w], [b]
    for i in range(max_epochs):
        dw, db = 0, 0
        for x, y in zip(X, Y):
            dw += grad_w(x, w, b, y)
            db += grad_b(x, w, b, y)
        w = w - cyclic_lr(i, max_lr=0.1, base_lr=0.001, step_size=30) * dw
        b = b - cyclic_lr(i, max_lr=0.1, base_lr=0.001, step_size=30) * db
        w_trace.append(w)
        b_trace.append(b)
    return w_trace, b_trace
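To see the triangular pattern, the schedule can be evaluated on its own; the step_size and learning-rate bounds below mirror those used in do_gradient_descent_clr:

lrs = [cyclic_lr(i, max_lr=0.1, base_lr=0.001, step_size=30) for i in range(0, 121, 15)]
print([round(lr, 3) for lr in lrs])
# rises from base_lr to max_lr over 30 iterations, falls back over the next 30, then repeats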
\(\eta_t=\eta_{max}\) for \(t=0\)
\(\eta_t=\eta_{min}\) for \(t=T\)
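A common schedule satisfying these two endpoints is cosine annealing. A minimal sketch (the function name cosine_annealing_lr and the example values of \( \eta_{max} \), \( \eta_{min} \) and \( T \) are illustrative):

import numpy as np

def cosine_annealing_lr(t, T, eta_max, eta_min):
    # eta_t = eta_min + 0.5*(eta_max - eta_min)*(1 + cos(pi * t / T))
    return eta_min + 0.5*(eta_max - eta_min)*(1 + np.cos(np.pi * t / T))

# eta_0 = eta_max, eta_T = eta_min, with a smooth cosine decay in between
print([round(cosine_annealing_lr(t, T=100, eta_max=1.0, eta_min=0.1), 3) for t in (0, 25, 50, 75, 100)])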
warmupSteps=4000
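This value appears in the linear-warmup-then-decay schedule popularised by the Transformer, where the learning rate grows linearly for the first warmupSteps updates and then decays as the inverse square root of the step number. A sketch under the assumption that this is the intended schedule (d_model = 512 is illustrative):

def warmup_lr(step, d_model=512, warmup_steps=4000):
    # linear warmup for the first warmup_steps updates, then ~1/sqrt(step) decay
    step = max(step, 1)
    return d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)

# the learning rate peaks around step = warmup_steps and decays afterwards
print([round(warmup_lr(s), 6) for s in (1, 1000, 4000, 10000, 40000)])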