I tested some simple models in PyTorch, where Adai does perform better than Adam, so I tried using Adai to fit the Bortfeld function. I implemented two MATLAB functions to compare the performance of Adai and Adam, and I found that Adai does not converge while Adam does. Is my implementation wrong?
function [theta_best,loss] = adai(depth,para,idd_i,lb,ub,lr)
% ADAI  Box-constrained least-squares fit using the Adai (adaptive inertia) update.
%
%   [theta_best,loss] = adai(depth,para,idd_i,lb,ub,lr)
%
%   Inputs
%     depth  - forwarded unchanged as the first argument of bf_mex.
%     para   - initial parameter vector (starting point of the iteration).
%     idd_i  - target data that bf_mex(depth,theta,'idd') is fitted to.
%     lb, ub - lower / upper bounds. They are indexed with a logical mask
%              built from theta, so they must have the same size as para.
%     lr     - step size. The step is lr*m_hat with NO division by
%              sqrt(v_hat); the second moment only controls the
%              per-parameter momentum. The useful range of lr therefore
%              scales with the gradient magnitude and generally differs
%              from the lr that suits a sqrt(v)-normalised update.
%
%   Outputs
%     theta_best - parameters with the lowest recorded loss, including the
%                  starting point para.
%     loss       - T-by-1 loss history, loss(1) being the initial loss.
%                  When the loop stops early the remaining entries stay 0.
%
%   Notes
%     * The gradient is that of the SQUARED residual norm (2*J'*r), while
%       the recorded loss is the (unsquared) Frobenius norm of r.
%     * NOTE(review): assumes bf_mex(...,'jacobian') returns one row per
%       residual entry and one column per parameter, and that para is a
%       column vector - confirm against bf_mex.
%     * Assumes bf_mex is deterministic: the residual computed at the end
%       of one iteration is reused for the next gradient.

T = 2000;          % maximum number of loss evaluations (incl. the initial one)
beta0 = 0.1;       % inertia scale
beta2 = 0.99;      % decay rate of the second-moment estimate
epsilon = 1e-3;    % keeps the per-parameter momentum <= 1 - epsilon

loss = zeros(T,1);
m_tm1 = 0;
v_tm1 = 0;
beta1_cum_prod = 1;    % running product of the per-parameter momentum factors
theta_tm1 = para;

% Residual at the current iterate; reused for both the loss and the gradient.
residual = bf_mex(depth,theta_tm1,'idd') - idd_i;
loss(1) = norm(residual,'fro');

% Seed the best-so-far tracker with the starting point so theta_best can
% never be worse than para.
theta_best = para;
loss_best = loss(1);
if isnan(loss_best)
    % An undefined initial loss must not block every later comparison.
    loss_best = Inf;
end

for t = 2:T
    % Gradient of the squared residual norm: 2 * J' * r
    g_t = 2*bf_mex(depth,theta_tm1,'jacobian')'*residual;

    % Biased second raw moment estimate and its bias correction.
    % The step counter is t-1 because the loop starts at t = 2.
    v_t = beta2*v_tm1 + (1-beta2)*g_t.^2;
    v_t_hat = v_t / (1-beta2^(t-1));

    % Adaptive inertia: parameters whose second moment is large relative
    % to the mean get less momentum; clamp to [0, 1-epsilon].
    v_t_mean = mean(v_t_hat);
    beta1t = max(min(1-(v_t_hat./v_t_mean).*beta0, 1-epsilon),0);

    % Biased first moment estimate with per-parameter momentum
    m_t = beta1t.*m_tm1 + (1-beta1t).*g_t;

    % Bias correction uses the cumulative product of the varying momenta
    beta1_cum_prod = beta1_cum_prod.*beta1t;
    m_t_hat = m_t ./ (1-beta1_cum_prod);

    % Parameter update (no second-moment normalisation of the step)
    theta_t = theta_tm1 - lr*m_t_hat;

    % Project back into the box [lb, ub]
    below = theta_t < lb;
    theta_t(below) = lb(below);
    above = theta_t > ub;
    theta_t(above) = ub(above);

    % Shift state for the next iteration
    theta_tm1 = theta_t;
    m_tm1 = m_t;
    v_tm1 = v_t;

    % Loss at the new iterate
    residual = bf_mex(depth,theta_t,'idd') - idd_i;
    loss(t) = norm(residual,'fro');

    if loss(t) < loss_best
        loss_best = loss(t);
        theta_best = theta_t;
    end

    % Stop once the loss has stalled
    if (abs(loss(t) - loss(t-1)) < 1e-6)
        break;
    end
end
end
function [theta_best,loss] = adam(depth,para,idd_i,lb,ub,lr)
% ADAM  Box-constrained least-squares fit using the Adam update.
%
%   [theta_best,loss] = adam(depth,para,idd_i,lb,ub,lr)
%
%   Inputs
%     depth  - forwarded unchanged as the first argument of bf_mex.
%     para   - initial parameter vector (starting point of the iteration).
%     idd_i  - target data that bf_mex(depth,theta,'idd') is fitted to.
%     lb, ub - lower / upper bounds. They are indexed with a logical mask
%              built from theta, so they must have the same size as para.
%     lr     - step size; each step is lr*m_hat./(sqrt(v_hat)+epsilon).
%
%   Outputs
%     theta_best - parameters with the lowest recorded loss, including the
%                  starting point para.
%     loss       - T-by-1 loss history, loss(1) being the initial loss.
%                  When the loop stops early the remaining entries stay 0.
%
%   Notes
%     * The gradient is that of the SQUARED residual norm (2*J'*r), while
%       the recorded loss is the (unsquared) Frobenius norm of r.
%     * NOTE(review): assumes bf_mex(...,'jacobian') returns one row per
%       residual entry and one column per parameter, and that para is a
%       column vector - confirm against bf_mex.
%     * Assumes bf_mex is deterministic: the residual computed at the end
%       of one iteration is reused for the next gradient.

T = 2000;          % maximum number of loss evaluations (incl. the initial one)
beta1 = 0.9;       % decay rate of the first-moment estimate
beta2 = 0.999;     % decay rate of the second-moment estimate
epsilon = 1e-8;    % guards the division in the parameter update

loss = zeros(T,1);
m_tm1 = 0;
v_tm1 = 0;
theta_tm1 = para;

% Residual at the current iterate; reused for both the loss and the gradient.
residual = bf_mex(depth,theta_tm1,'idd') - idd_i;
loss(1) = norm(residual,'fro');

% Seed the best-so-far tracker with the starting point so theta_best can
% never be worse than para.
theta_best = para;
loss_best = loss(1);
if isnan(loss_best)
    % An undefined initial loss must not block every later comparison.
    loss_best = Inf;
end

for t = 2:T
    % Gradient of the squared residual norm: 2 * J' * r
    g_t = 2*bf_mex(depth,theta_tm1,'jacobian')'*residual;

    % Biased first and second raw moment estimates
    m_t = beta1*m_tm1 + (1-beta1)*g_t;
    v_t = beta2*v_tm1 + (1-beta2)*g_t.^2;

    % Bias-corrected estimates. The step counter is t-1 because the loop
    % starts at t = 2.
    m_t_hat = m_t / (1-beta1^(t-1));
    v_t_hat = v_t / (1-beta2^(t-1));

    % Parameter update, normalised per parameter by sqrt(v_hat)
    theta_t = theta_tm1 - lr*m_t_hat./(sqrt(v_t_hat)+epsilon);

    % Project back into the box [lb, ub]
    below = theta_t < lb;
    theta_t(below) = lb(below);
    above = theta_t > ub;
    theta_t(above) = ub(above);

    % Shift state for the next iteration
    theta_tm1 = theta_t;
    m_tm1 = m_t;
    v_tm1 = v_t;

    % Loss at the new iterate
    residual = bf_mex(depth,theta_t,'idd') - idd_i;
    loss(t) = norm(residual,'fro');

    if loss(t) < loss_best
        loss_best = loss(t);
        theta_best = theta_t;
    end

    % Stop once the loss has stalled
    if (abs(loss(t) - loss(t-1)) < 1e-6)
        break;
    end
end
end