#ifndef TURI_FACTORIZATION_GLM_SGD_INTERFACE_H_
#define TURI_FACTORIZATION_GLM_SGD_INTERFACE_H_

#include <core/util/code_optimization.hpp>
#include <core/util/fast_integer_power.hpp>
#include <toolkits/sgd/sgd_interface.hpp>
#include <toolkits/factorization/factorization_model_impl.hpp>

namespace turi {
namespace factorization {

enum class model_regularization_type { L2, ON_THE_FLY, NONE };
/** Clip a value to the interval [-1, 1]. */
template <typename T>
static inline T clip_1m1(T v) {
  return (v < T(-1)) ? T(-1) : ((v > T(1)) ? T(1) : v);
}

/** Square a value. */
template <typename T>
static inline T sqr(T v) {
  return v * v;
}
template <typename GLMModel,
          typename _LossModelProfile,
          model_regularization_type _regularization_type>
class factorization_sgd_interface final : public sgd::sgd_interface_base {
 public:

  factorization_sgd_interface()
      : iteration_sample_count(0)
  {}

  typedef _LossModelProfile LossModelProfile;
  LossModelProfile loss_model;
  typedef typename GLMModel::factor_type factor_type;
  typedef typename GLMModel::factor_matrix_type factor_matrix_type;
  typedef typename GLMModel::vector_type vector_type;

  static constexpr model_factor_mode factor_mode = GLMModel::factor_mode;
  static constexpr flex_int num_factors_if_known = GLMModel::num_factors_if_known;

  // Lock each item's parameters during updates so concurrent threads do not
  // stomp on the same factor row.
  static constexpr bool enable_item_locking = true;

  bool currently_in_trial_mode = false;

  static constexpr model_regularization_type regularization_type = _regularization_type;

  // If true, the factors are constrained to be nonnegative.
  bool nmf_mode = false;
  // Number of latent factors.
  inline size_t num_factors() const {
    return ((factor_mode == model_factor_mode::pure_linear_model)
            ? 0
            : ( (num_factors_if_known == Eigen::Dynamic)
                ? model->num_factors()
                : num_factors_if_known));
  }

  // The model whose parameters are being optimized.
  std::shared_ptr<GLMModel> model;

  // Total number of linear dimensions across all columns.
  size_t n_total_dimensions() const {
    return model->n_total_dimensions;
  }
  // Number of rows of V carrying factor terms.
  size_t num_factor_dimensions() const {
    switch(factor_mode) {
      case model_factor_mode::pure_linear_model:
        return 0;

      case model_factor_mode::matrix_factorization:
      case model_factor_mode::factorization_machine:
        return model->num_factor_dimensions;
    }
    return 0;
  }
  // Regularization parameters, set from the options in setup().
  double _lambda_w = NAN;
  double _lambda_V = NAN;

  size_t current_iteration = size_t(-1);
  double current_iteration_step_size = 0;

  // Regularization tempering: start from a larger regularization value and
  // anneal down to the requested one over the first few iterations.
  size_t num_tempering_iterations = 0;
  double tempering_regularization_start_value = 0;
  // The regularization values in effect at a given iteration, possibly
  // tempered during the first iterations.
  double current_lambda_w(size_t iteration) const {
    return _interpolate_reg_value(iteration, _lambda_w);
  }

  double current_lambda_V(size_t iteration) const {
    return _interpolate_reg_value(iteration, _lambda_V);
  }

  // Interpolate, in log space, between the tempering start value and the
  // requested regularization over the first num_tempering_iterations.
  double _interpolate_reg_value(size_t iteration, double lambda) const {
    if(iteration >= num_tempering_iterations)
      return lambda;

    if(currently_in_trial_mode && iteration != 0)
      return lambda;

    double end_reg = std::max(1e-12, lambda);
    double begin_reg = tempering_regularization_start_value;

    if(end_reg >= begin_reg)
      return lambda;

    double s = double(iteration) / num_tempering_iterations;
    double ret = std::exp(std::log(begin_reg) * (1.0 - s) + std::log(end_reg) * s);

    return ret;
  }
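  // Illustrative example (hypothetical values, not defaults): with
  // tempering_regularization_start_value = 1.0, lambda = 1e-4 and
  // num_tempering_iterations = 4, the tempered value over iterations
  // 0,1,2,3 is 1.0, 1e-1, 1e-2, 1e-3, and from iteration 4 on it is the
  // requested 1e-4.  Interpolating in log space gives a geometric decay
  // rather than a linear one.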
  // Lazy L2 shrinkage state: each processed sample implicitly multiplies the
  // linear terms by s_w_factor and the factor rows by s_V_factor.
  double s_w_factor = NAN, s_V_factor = NAN;
  fast_integer_power s_w_factor_pow, s_V_factor_pow;

  bool s_w_identically_1 = true, s_V_identically_1 = true;

  // Number of samples processed so far in the current iteration.
  std::atomic<size_t> iteration_sample_count;

  // Shrinkage applied directly to the touched parameters in ON_THE_FLY mode.
  double w_shrinkage = NAN, V_shrinkage = NAN;
  vector_type on_the_fly__regularization_scaling_factors;

  // One lock per item, used when enable_item_locking is set.
  std::vector<simple_spinlock> item_locks;

  size_t parameter_scaling_offset;
  vector_type parameter_scaling;

  size_t n_threads = 1;
  size_t data_size = 0;
  // Per-thread scratch space used while processing a single observation.
  struct sgd_processing_buffer {
    double w0;

    struct variable {
      size_t index;          // Global index of this term in the model.
      double xv;             // Scaled and shifted feature value.
      double w;              // Cached linear coefficient.
      factor_type V_row;     // Cached copy of the factor row.
      float * __restrict__ V_row_ptr;
      factor_type xV_row;    // xv * V_row (later reused for its gradient).
    };

    std::vector<variable> v;

    // Accumulator of the xV_row terms, used by the factorization machine.
    factor_type xv_accumulator;
  };

  mutable std::vector<sgd_processing_buffer> buffers, alt_buffers;

  // Adagrad state: per-parameter accumulators of squared gradients.
  bool adagrad_mode = true;
  volatile double w0_adagrad_g;
  float adagrad_momentum_weighting = 1.0;
  Eigen::Matrix<float, Eigen::Dynamic, num_factors_if_known, Eigen::RowMajor> V_adagrad_g;
  Eigen::Matrix<float, Eigen::Dynamic, 1> w_adagrad_g;
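  // The adagrad scheme used below is the standard per-parameter one: for a
  // parameter p with gradient g, the accumulator is updated as G += g*g and
  // the applied step is
  //
  //     p -= step_size * g / sqrt(G)
  //
  // so frequently and strongly updated parameters get progressively smaller
  // effective step sizes.  (In this interface the step is additionally passed
  // through clip_1m1 and the lazy L2 rescaling before it touches the model.)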
  ////////////////////////////////////////////////////////////////////////////
  // Set up the interface from the training data and the user options.

  void setup(const v2::ml_data& train_data,
             const std::map<std::string, flexible_type>& options) {

    data_size = train_data.size();

    _lambda_w = options.at("linear_regularization");
    _lambda_V = options.at("regularization");

    num_tempering_iterations = options.at("num_tempering_iterations");
    num_tempering_iterations = std::min(num_tempering_iterations,
                                        size_t(options.at("max_iterations")));

    tempering_regularization_start_value = options.at("tempering_regularization_start_value");

    nmf_mode = options.at("nmf");

    adagrad_mode = (options.at("solver") == "adagrad");
    adagrad_momentum_weighting = options.at("adagrad_momentum_weighting");

    n_threads = thread::cpu_count();

    buffers.resize(n_threads);
    alt_buffers.resize(n_threads);

    // Size the per-thread buffers to hold the largest possible row.
    size_t max_row_size = train_data.max_row_size();

    for(std::vector<sgd_processing_buffer>* bv : {&buffers, &alt_buffers}) {
      for(sgd_processing_buffer& buffer : (*bv)) {
        buffer.v.resize(max_row_size);

        for(auto& var : buffer.v) {
          var.V_row.resize(num_factors());
          var.xV_row.resize(num_factors());
        }

        buffer.xv_accumulator.resize(num_factors());
      }
    }

    s_w_identically_1 = true;
    s_V_identically_1 = true;

    iteration_sample_count = 0;
    // Set up the data structures needed by the chosen regularization mode.
    switch(regularization_type) {
      case model_regularization_type::L2:
        break;

      case model_regularization_type::ON_THE_FLY: {
        on_the_fly__regularization_scaling_factors.resize(n_total_dimensions());

        size_t pos = 0;

        for(size_t c_idx = 0; c_idx < train_data.num_columns(); ++c_idx) {
          for(size_t i = 0; i < train_data.metadata()->index_size(c_idx); ++i, ++pos) {
            on_the_fly__regularization_scaling_factors[pos]
                = (double(train_data.metadata()->statistics(c_idx)->count(i))
                   / (std::max(size_t(1), train_data.size())));
          }
        }

        break;
      }

      case model_regularization_type::NONE:
        break;
    }

    // One lock per item; the item column is assumed to be column 1.
    static constexpr size_t ITEM_COLUMN_INDEX = 1;

    if(enable_item_locking)
      item_locks.resize(train_data.metadata()->index_size(ITEM_COLUMN_INDEX));

    // Adagrad accumulators, one entry per parameter.
    w_adagrad_g.resize(model->w.size());
    V_adagrad_g.resize(model->V.rows(), num_factors());
  }
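  // Note on the ON_THE_FLY scaling factors set up in setup(): each
  // dimension's factor is its observation count divided by the dataset size,
  // i.e. the fraction of rows in which that feature appears.  The
  // regularization applied to a term is therefore weighted by how often that
  // term is actually touched by SGD updates, rather than being applied
  // uniformly to all dimensions.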
  // The L2 regularization factor reported to the SGD solver; only the L2 mode
  // shrinks all parameters through the solver itself.
  double l2_regularization_factor() const {
    return (regularization_type == model_regularization_type::L2
            ? std::max(_lambda_w, _lambda_V)
            : 0.0);
  }

  // The largest usable step size: 0.9 / lambda keeps the per-sample shrinkage
  // factor (1 - step_size * lambda) safely positive.
  double max_step_size() const {
    switch(regularization_type) {
      case model_regularization_type::L2:
      case model_regularization_type::ON_THE_FLY:
        return 0.9 / (1e-16 + std::max(current_lambda_w(0), current_lambda_V(0)));

      case model_regularization_type::NONE:
      default:
        return std::numeric_limits<double>::max();
    }
  }
  ////////////////////////////////////////////////////////////////////////////
  // Called at the start of each SGD iteration.

  void setup_iteration(size_t iteration, double step_size) {

    current_iteration = iteration;
    current_iteration_step_size = step_size;

    switch(regularization_type) {
      case model_regularization_type::L2: {

        iteration_sample_count = 0;

        double lambda_w = current_lambda_w(iteration);
        double lambda_V = current_lambda_V(iteration);

        double w_step_size = step_size;
        double V_step_size = step_size;

        // Fold the average adagrad scaling into the effective step sizes used
        // for the lazy shrinkage factors.
        if(w_adagrad_g.size() != 0)
          w_step_size /= std::max(1.0, std::sqrt(double(w_adagrad_g.mean())));

        if(V_adagrad_g.rows() != 0)
          V_step_size /= std::max(1.0, std::sqrt(double(V_adagrad_g.mean())));

        s_w_factor = 1.0 - w_step_size * lambda_w;
        s_w_factor_pow.set_base(s_w_factor);
        s_w_identically_1 = (s_w_factor == 1.0);

        s_V_factor = 1.0 - V_step_size * lambda_V;
        s_V_factor_pow.set_base(s_V_factor);
        s_V_identically_1 = (s_V_factor == 1.0);

        break;
      }

      case model_regularization_type::ON_THE_FLY: {

        w_shrinkage = 1.0 - step_size * _lambda_w;
        V_shrinkage = 1.0 - step_size * _lambda_V;

        s_w_factor = 1.0;
        s_w_factor_pow.set_base(s_w_factor);
        s_w_identically_1 = true;

        s_V_factor = 1.0;
        s_V_factor_pow.set_base(s_V_factor);
        s_V_identically_1 = true;

        break;
      }

      case model_regularization_type::NONE: {

        s_w_factor = 1.0;
        s_w_factor_pow.set_base(s_w_factor);
        s_w_identically_1 = true;

        s_V_factor = 1.0;
        s_V_factor_pow.set_base(s_V_factor);
        s_V_identically_1 = true;

        break;
      }
    }

    set_denormal_are_zero();

    current_iteration = iteration;
  }
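  // Why the s_w / s_V machinery: under plain L2 regularization every SGD
  // sample would multiply every linear term by s_w_factor = 1 - step * lambda_w
  // and every factor row by s_V_factor.  Rather than touching all parameters
  // on every sample, the interface just counts processed samples
  // (iteration_sample_count = n) and treats the stored values as carrying an
  // implicit scale of s_w_factor^n / s_V_factor^n, evaluated on demand with
  // fast_integer_power.  The scales are folded back into the parameters once
  // per iteration in finalize_iteration().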
  ////////////////////////////////////////////////////////////////////////////
  // Called at the end of each SGD iteration.

  void finalize_iteration() {

    // Optionally smooth the adagrad accumulators toward their mean.
    if(adagrad_mode && adagrad_momentum_weighting != 1.0) {
      float rho = adagrad_momentum_weighting;

      if(w_adagrad_g.size() != 0)
        w_adagrad_g.array() = rho * w_adagrad_g.array() + (1 - rho) * w_adagrad_g.mean();

      if(V_adagrad_g.rows() != 0)
        V_adagrad_g.array() = rho * V_adagrad_g.array() + (1 - rho) * V_adagrad_g.mean();
    }
    switch(regularization_type) {

      case model_regularization_type::L2: {

        // Materialize the lazily accumulated shrinkage on all parameters.
        double n_samples_processed = iteration_sample_count;
        double s_w = s_w_factor_pow.pow(n_samples_processed);
        double s_V = s_V_factor_pow.pow(n_samples_processed);

        in_parallel([&](size_t thread_idx, size_t num_threads) {

          if(!nmf_mode && s_w != 1.0) {
            size_t start_w_idx = (thread_idx * n_total_dimensions()) / num_threads;
            size_t end_w_idx   = ((thread_idx + 1) * n_total_dimensions()) / num_threads;

            for(size_t i = start_w_idx; i < end_w_idx; ++i) {
              model->w[i] *= s_w;

              // Flush denormal-scale values to zero.
              if(std::fabs(model->w[i]) < 1e-16)
                model->w[i] = 0;
            }
          }

          if(num_factor_dimensions() != 0 && s_V != 1.0) {
            size_t start_V_idx = (thread_idx * num_factor_dimensions()) / num_threads;
            size_t end_V_idx   = ((thread_idx + 1) * num_factor_dimensions()) / num_threads;

            for(size_t i = start_V_idx; i < end_V_idx; ++i) {
              for(size_t j = 0; j < num_factors(); ++j) {
                model->V(i, j) *= s_V;

                if(std::fabs(model->V(i, j)) < 1e-16)
                  model->V(i, j) = 0;
              }
            }
          }
        });

        break;
      }

      case model_regularization_type::ON_THE_FLY:
      case model_regularization_type::NONE:
        break;
    }

    unset_denormal_are_zero();
  }
  // Test whether the current state is still numerically usable.
  bool state_is_numerically_stable() const GL_HOT_INLINE_FLATTEN {

    if(!(std::isfinite(model->w0) && std::fabs(model->w0) <= 1e12))
      return false;

    // In nmf mode, the nonnegative factors must not have all collapsed to zero.
    if(nmf_mode && num_factor_dimensions() != 0) {
      bool any_nonzero = false;

      for(size_t i = 0; i < num_factor_dimensions(); ++i) {
        if(model->V.row(i).sum() > 1e-16) {
          any_nonzero = true;
          break;
        }
      }

      if(!any_nonzero)
        return false;
    }

    return true;
  }
  ////////////////////////////////////////////////////////////////////////////
  // Initialize the model state before optimization begins.

  void setup_optimization(size_t random_seed = size_t(-1), bool trial_mode = false) {

    if(random_seed == size_t(-1))
      random_seed = model->options.at("random_seed");

    model->reset_state(random_seed, 0.001);

    currently_in_trial_mode = trial_mode;

    // Start the adagrad accumulators at a tiny positive value.
    w_adagrad_g.setConstant(1e-16);
    V_adagrad_g.setConstant(1e-16);
    w0_adagrad_g = 1e-16;
  }
  ////////////////////////////////////////////////////////////////////////////
  // The regularization penalty of the current model state.

  double current_regularization_penalty() const {

    double lambda_w = current_lambda_w(current_iteration);
    double lambda_V = current_lambda_V(current_iteration);

    if(regularization_type == model_regularization_type::NONE
       || (lambda_w == 0 && lambda_V == 0) ) {
      return 0;
    }

    std::vector<double> accumulative_regularization_penalty(n_threads, 0);

    in_parallel([&](size_t thread_idx, size_t num_threads) {

      size_t w_start_idx = (thread_idx * size_t(model->w.size())) / num_threads;
      size_t w_end_idx   = ((thread_idx + 1) * size_t(model->w.size())) / num_threads;

      size_t V_start_idx = (thread_idx * size_t(model->V.rows())) / num_threads;
      size_t V_end_idx   = ((thread_idx + 1) * size_t(model->V.rows())) / num_threads;

      if(regularization_type == model_regularization_type::ON_THE_FLY) {

        for(size_t i = w_start_idx; i < w_end_idx; ++i) {
          accumulative_regularization_penalty[thread_idx]
              += (lambda_w
                  * on_the_fly__regularization_scaling_factors[i]
                  * (model->w[i] * model->w[i]));
        }

        for(size_t i = V_start_idx; i < V_end_idx; ++i) {
          accumulative_regularization_penalty[thread_idx]
              += (lambda_V
                  * on_the_fly__regularization_scaling_factors[i]
                  * model->V.row(i).squaredNorm());
        }

      } else {

        accumulative_regularization_penalty[thread_idx] +=
            lambda_w * model->w.segment(w_start_idx, w_end_idx - w_start_idx).squaredNorm();

        accumulative_regularization_penalty[thread_idx] +=
            lambda_V * model->V.block(V_start_idx, 0, V_end_idx - V_start_idx, num_factors()).squaredNorm();
      }
    });

    double total_reg = std::accumulate(accumulative_regularization_penalty.begin(),
                                       accumulative_regularization_penalty.end(),
                                       double(0));

    return total_reg;
  }
  ////////////////////////////////////////////////////////////////////////////
  // Average per-sample loss of the current model over a dataset.

  double calculate_loss(const v2::ml_data& data) const {

    std::vector<double> total_loss_accumulator(n_threads, 0);

    volatile bool numerical_error_detected = false;

    in_parallel([&](size_t thread_idx, size_t num_threads) GL_GCC_ONLY(GL_HOT) {

      std::vector<v2::ml_data_entry> x;

      for(auto it = data.get_iterator(thread_idx, num_threads);
          !it.done() && !numerical_error_detected;
          ++it) {

        it.fill_observation(x);
        double y = it.target_value();

        double fx_pred = calculate_fx(thread_idx, x);
        double point_loss = loss_model.loss(fx_pred, y);

        if(!std::isfinite(point_loss)) {
          numerical_error_detected = true;
          break;
        }

        total_loss_accumulator[thread_idx] += point_loss;
      }
    });

    if(numerical_error_detected)
      return NAN;

    double total_loss = std::accumulate(total_loss_accumulator.begin(),
                                        total_loss_accumulator.end(), double(0));

    size_t n = data.size();
    double loss_value = (n != 0) ? (total_loss / n) : 0;

    return loss_value;
  }
  // Translate the averaged raw loss into the value reported to the user.
  double reported_loss_value(double accumulative_loss) const {
    return loss_model.reported_loss_value(accumulative_loss);
  }

  // The name of the reported loss metric.
  std::string reported_loss_name() const {
    return loss_model.reported_loss_name();
  }
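  // The distinction matters because the raw per-sample loss is not always the
  // number users expect to see; for instance, with a squared-error profile the
  // averaged loss is typically reported as its square root (RMSE).  The exact
  // mapping is owned by the LossModelProfile, not by this interface.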
  ////////////////////////////////////////////////////////////////////////////
  // Fill the per-thread buffer from one observation x and return the current
  // prediction f(x).  l2_s_w_old / l2_s_V_old are the lazy L2 scale factors
  // currently in effect for the stored parameters.

  inline double _fill_buffer_calc_value(
      sgd_processing_buffer& buffer,
      const std::vector<v2::ml_data_entry>& x,
      double l2_s_w_old, double l2_s_V_old) const GL_HOT {

    const bool using_l2_regularization
        = (regularization_type == model_regularization_type::L2);

    const double s_w = using_l2_regularization ? l2_s_w_old : 1;
    const double s_V = using_l2_regularization ? l2_s_V_old : 1;

    const size_t x_size = x.size();

    double fx_value = 0;

    switch(factor_mode) {

      //////////////////////////////////////////////////////////////////////
      // Full factorization machine: every pair of terms interacts.

      case model_factor_mode::factorization_machine: {

        buffer.xv_accumulator.setZero();

        for(size_t j = 0; j < x_size; ++j) {

          const auto& v = x[j];
          auto& b = buffer.v[j];

          const size_t global_idx = model->index_offsets[v.column_index] + v.index;
          b.index = global_idx;

          b.w = model->w[global_idx];

          double value_shift, value_scale;
          std::tie(value_shift, value_scale) = model->column_shift_scales[global_idx];

          b.xv = value_scale * (v.value - value_shift);

          b.V_row = model->V.row(global_idx);
          b.xV_row = (s_V * b.xv) * b.V_row;
          buffer.xv_accumulator += b.xV_row;

          fx_value += (s_w * b.xv) * b.w;
        }

        // Pairwise interaction terms.
        double fx_delta = 0;

        for(size_t j = 0; j < x.size(); ++j) {
          const auto& b = buffer.v[j];
          fx_delta += buffer.xv_accumulator.dot(b.xV_row) - b.xV_row.squaredNorm();
        }

        fx_value += 0.5 * fx_delta;

        buffer.w0 = model->w0;
        fx_value += buffer.w0;

        break;
      }
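      // The pairwise term above relies on the usual factorization machine
      // identity
      //
      //   sum_{j<k} <x_j V_j, x_k V_k>
      //       = 0.5 * ( ||sum_j x_j V_j||^2 - sum_j ||x_j V_j||^2 ),
      //
      // which is exactly what xv_accumulator and fx_delta evaluate, in
      // O(x_size * num_factors) rather than O(x_size^2 * num_factors).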
      //////////////////////////////////////////////////////////////////////
      // Plain matrix factorization: only the first two columns (user, item)
      // carry factor rows; any remaining columns enter linearly.

      case model_factor_mode::matrix_factorization: {

        DASSERT_GE(buffer.v.size(), x.size());

        // The user (j == 0) and item (j == 1) terms.
        for(size_t j : {1, 0}) {

          const auto& v = x[j];
          auto& b = buffer.v[j];

          const size_t global_idx = (j == 0 ? 0 : model->index_offsets[1]) + v.index;
          b.index = global_idx;
          b.V_row = model->V.row(global_idx);

          DASSERT_EQ(v.value, 1);
          b.xv = 1;

          fx_value += s_w * model->w[global_idx];
        }

        // Side features only contribute through the linear terms.
        for(size_t j = 2; j < x_size; ++j) {

          const auto& v = x[j];
          auto& b = buffer.v[j];

          const size_t global_idx = model->index_offsets[v.column_index] + v.index;
          b.index = global_idx;

          double value_shift, value_scale;
          std::tie(value_shift, value_scale) = model->column_shift_scales[global_idx];

          b.xv = value_scale * (v.value - value_shift);

          fx_value += (s_w * b.xv) * model->w[global_idx];
        }

        // The single user-item interaction term.
        fx_value += (s_V * s_V) * (buffer.v[0].V_row.dot(buffer.v[1].V_row));

        buffer.w0 = model->w0;
        fx_value += buffer.w0;

        break;
      }
      //////////////////////////////////////////////////////////////////////
      // Pure linear model: no factor terms at all.

      case model_factor_mode::pure_linear_model: {

        DASSERT_GE(buffer.v.size(), x.size());

        for(size_t j = 0; j < x_size; ++j) {
          const auto& v = x[j];
          auto& b = buffer.v[j];

          const size_t global_idx = model->index_offsets[v.column_index] + v.index;
          b.index = global_idx;
          b.w = model->w[global_idx];

          double value_shift, value_scale;
          std::tie(value_shift, value_scale) = model->column_shift_scales[global_idx];

          b.xv = value_scale * (v.value - value_shift);

          fx_value += (s_w * b.xv) * b.w;
        }

        buffer.w0 = model->w0;
        fx_value += buffer.w0;

        break;
      }
    }

    return fx_value;
  }
  ////////////////////////////////////////////////////////////////////////////
  // Scale factors needed to read and write parameters under the lazy L2
  // shrinkage scheme.

  struct _regularization_updates {
    double s_w_old, s_w_new_inv;
    double s_V_old, s_V_new_inv;
  };

  inline _regularization_updates _apply_regularization_update(
      double step_size, bool apply_regularization = true) {

    switch(regularization_type) {

      case model_regularization_type::L2: {

        _regularization_updates ru;

        // Count this sample, unless we are only reading the state.
        size_t n = (apply_regularization
                    ? size_t(iteration_sample_count.fetch_add(1, std::memory_order_relaxed))
                    : size_t(iteration_sample_count));

        if(s_w_identically_1) {
          ru.s_w_old = 1.0;
          ru.s_w_new_inv = 1.0;
        } else {
          ru.s_w_old = s_w_factor_pow.pow(n);
          ru.s_w_new_inv = 1.0 / (ru.s_w_old * s_w_factor);
        }

        if(s_V_identically_1) {
          ru.s_V_old = 1.0;
          ru.s_V_new_inv = 1.0;
        } else {
          ru.s_V_old = s_V_factor_pow.pow(n);
          ru.s_V_new_inv = 1.0 / (ru.s_V_old * s_V_factor);
        }

        return ru;
      }

      case model_regularization_type::ON_THE_FLY:
      case model_regularization_type::NONE:
      default:
        return {1.0, 1.0, 1.0, 1.0};
    }
  }
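  // How these scales are used: with lazy L2 shrinkage the stored parameters
  // carry an implicit scale, so the effective value of a linear term after n
  // samples is s_w_factor^n times the stored value.  s_w_old / s_V_old are the
  // scales in effect when an observation is read (passed to
  // _fill_buffer_calc_value), while s_w_new_inv / s_V_new_inv rescale the
  // gradient step so that it has the intended magnitude in effective,
  // unscaled coordinates after the next implicit shrinkage.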
  ////////////////////////////////////////////////////////////////////////////
  // Apply the gradient step to the global intercept w0.

  inline void _apply_w0_gradient(
      sgd_processing_buffer& buffer,
      double l_grad,
      double step_size) {

    // The squared error profile handles the intercept separately and skips
    // the per-sample update here.
    if(std::is_same<LossModelProfile, loss_squared_error>::value)
      return;
    double delta = l_grad;

    if(adagrad_mode) {
      double _wg = (w0_adagrad_g += (delta * delta));
      delta /= std::sqrt(_wg);
    }

    // All threads update w0 concurrently, so scale the step down by n_threads.
    model->w0 -= step_size * delta / n_threads;
  }

  atomic<size_t> hits;
  ////////////////////////////////////////////////////////////////////////////
  // Apply the gradient step to the linear terms w and the factor rows V for
  // one observation, using the buffer filled by _fill_buffer_calc_value.

  inline void _apply_w_V_gradient(
      sgd_processing_buffer& buffer, double l_grad,
      double s_w_new_inv, double s_V_new_inv,
      size_t x_size, double step_size) {

    static constexpr bool using_on_the_fly_regularization =
        (regularization_type == model_regularization_type::ON_THE_FLY);

    typedef volatile float * __restrict__ vfloat_ptr;

    // If the step size differs from the iteration's base step size, rescale
    // the adagrad accumulation accordingly.
    float ss_scaling_factor = float(adagrad_mode ? sqr(step_size / current_iteration_step_size) : 1.0);
    switch(factor_mode) {

      //////////////////////////////////////////////////////////////////////
      // Factorization machine: every term has a linear part and a factor row.

      case model_factor_mode::factorization_machine: {

        // Prefetch the factor rows that are about to be updated.
        for(size_t j = 1; j < x_size; ++j)
          __builtin_prefetch(&(model->V(buffer.v[j].index)), 1, 1);

        for(size_t j = 0; j < x_size; ++j) {

          auto& b = buffer.v[j];

          if(b.xv == 0)
            continue;

          // Linear term update.
          double w_grad = l_grad * b.xv;
          double step_w_scale = step_size;
          double step_V_scale = step_size;

          if(adagrad_mode) {
            w_adagrad_g[b.index] += ss_scaling_factor * w_grad * w_grad;
            step_w_scale /= std::sqrt(w_adagrad_g[b.index]);
          }

          model->w[b.index] -= clip_1m1(w_grad * step_w_scale) * s_w_new_inv;

          if(using_on_the_fly_regularization) {
            model->w[b.index] *= w_shrinkage;
          }
          // Factor row update, using the accumulated interaction vector.
          b.xV_row = (l_grad * (buffer.xv_accumulator - b.xV_row));

          if(adagrad_mode) {
            for(size_t i = 0; i < num_factors(); ++i) {
              V_adagrad_g(b.index, i) += ss_scaling_factor * b.xV_row[i] * b.xV_row[i];
              b.xV_row[i] /= std::sqrt(V_adagrad_g(b.index, i));
            }
          }

          if(nmf_mode) {
            // Update the cached row, then clamp at zero before writing back.
            for(size_t i = 0; i < num_factors(); ++i)
              b.V_row[i] -= clip_1m1(step_V_scale * b.xV_row[i]) * s_V_new_inv;

            if(using_on_the_fly_regularization)
              b.V_row *= V_shrinkage;

            for(size_t i = 0; i < num_factors(); ++i) {
              if(b.V_row[i] < 0)
                b.V_row[i] = 0;
            }

            model->V.row(b.index) = b.V_row;

          } else if(using_on_the_fly_regularization) {
            for(size_t i = 0; i < num_factors(); ++i)
              model->V(b.index, i)
                  = V_shrinkage * (b.V_row[i] - clip_1m1(step_V_scale * b.xV_row[i]) * s_V_new_inv);

          } else {
            // Vectorized update with clipping.
            b.xV_row *= step_V_scale;
            b.xV_row = b.xV_row.cwiseMin(float(1.0));
            b.xV_row = b.xV_row.cwiseMax(float(-1.0));

            model->V.row(b.index) -= s_V_new_inv * b.xV_row;
          }
        }

        break;
      }
      //////////////////////////////////////////////////////////////////////
      // Matrix factorization: linear terms for every column, factor rows only
      // for the user and item columns.

      case model_factor_mode::matrix_factorization: {

        // Linear terms.
        for(size_t j = 0; j < x_size; ++j) {
          const auto& b = buffer.v[j];

          double w_delta = l_grad * b.xv;

          if(adagrad_mode) {
            w_adagrad_g[b.index] += ss_scaling_factor * w_delta * w_delta;
            w_delta /= std::sqrt(w_adagrad_g[b.index]);
          }

          vfloat_ptr w_ptr = (vfloat_ptr)(&(model->w[b.index]));

          if(using_on_the_fly_regularization)
            *w_ptr = w_shrinkage * ((*w_ptr) - (clip_1m1(step_size * w_delta) * s_w_new_inv));
          else
            *w_ptr -= clip_1m1(step_size * w_delta) * s_w_new_inv;
        }

        // Factor rows for the user and item terms.
        auto GL_GCC_ONLY(__restrict__)& b0 = buffer.v[0];
        auto GL_GCC_ONLY(__restrict__)& b1 = buffer.v[1];

        // The gradient of <V_user, V_item> with respect to each row is the
        // other row.
        b0.xV_row = l_grad * b1.V_row;
        b1.xV_row = l_grad * b0.V_row;

        if(adagrad_mode) {
          for(size_t i = 0; i < num_factors(); ++i) {
            V_adagrad_g(b0.index, i) += ss_scaling_factor * b0.xV_row[i] * b0.xV_row[i];
            b0.xV_row[i] /= std::sqrt(V_adagrad_g(b0.index, i));
          }

          for(size_t i = 0; i < num_factors(); ++i) {
            V_adagrad_g(b1.index, i) += ss_scaling_factor * b1.xV_row[i] * b1.xV_row[i];
            b1.xV_row[i] /= std::sqrt(V_adagrad_g(b1.index, i));
          }
        }

        // Scale and clip the steps.
        b0.xV_row *= step_size;
        b0.xV_row = b0.xV_row.cwiseMin(float(1.0));
        b0.xV_row = b0.xV_row.cwiseMax(float(-1.0));

        b1.xV_row *= step_size;
        b1.xV_row = b1.xV_row.cwiseMin(float(1.0));
        b1.xV_row = b1.xV_row.cwiseMax(float(-1.0));

        b0.V_row -= s_V_new_inv * b0.xV_row;
        b1.V_row -= s_V_new_inv * b1.xV_row;

        if(using_on_the_fly_regularization) {
          b0.V_row *= V_shrinkage;
          b1.V_row *= V_shrinkage;
        }

        if(nmf_mode) {
          // Project back onto the nonnegative orthant.
          model->V.row(b0.index) = b0.V_row.cwiseMax(float(0));
          model->V.row(b1.index) = b1.V_row.cwiseMax(float(0));
        } else {
          model->V.row(b0.index) = b0.V_row;
          model->V.row(b1.index) = b1.V_row;
        }

        break;
      }
      //////////////////////////////////////////////////////////////////////
      // Pure linear model: only the linear terms get updated.

      case model_factor_mode::pure_linear_model: {

        for(size_t j = 0; j < x_size; ++j) {
          const auto& b = buffer.v[j];

          double w_delta = l_grad * b.xv;

          if(adagrad_mode) {
            w_adagrad_g[b.index] += ss_scaling_factor * w_delta * w_delta;
            w_delta /= std::sqrt(w_adagrad_g[b.index]);
          }

          vfloat_ptr w_ptr = (vfloat_ptr)(&(model->w[b.index]));

          if(using_on_the_fly_regularization)
            *w_ptr = w_shrinkage * ((*w_ptr) - (clip_1m1(step_size * w_delta) * s_w_new_inv));
          else
            *w_ptr -= clip_1m1(step_size * w_delta) * s_w_new_inv;
        }

        break;
      }
    }
  }
  ////////////////////////////////////////////////////////////////////////////
  // Compute the prediction f(x) for a single observation.

  double calculate_fx(size_t thread_idx,
                      const std::vector<v2::ml_data_entry>& x) const GL_HOT_FLATTEN {

    DASSERT_LT(thread_idx, buffers.size());

    sgd_processing_buffer& buffer = buffers[thread_idx];

    switch(regularization_type) {

      case model_regularization_type::L2: {
        // Account for the lazily applied shrinkage when reading parameters.
        size_t n = iteration_sample_count;

        double s_w = 1.0, s_V = 1.0;

        if(!s_w_identically_1)
          s_w = s_w_factor_pow.pow(n);

        if(!s_V_identically_1)
          s_V = s_V_factor_pow.pow(n);

        return _fill_buffer_calc_value(buffer, x, s_w, s_V);
      }

      case model_regularization_type::ON_THE_FLY:
      case model_regularization_type::NONE: {
        return _fill_buffer_calc_value(buffer, x, 1.0, 1.0);
      }
    }
  }
  ////////////////////////////////////////////////////////////////////////////
  // Apply a single SGD step for one observation and return its loss.

  GL_HOT_INLINE_FLATTEN
  double apply_sgd_step(size_t thread_idx,
                        const std::vector<v2::ml_data_entry>& x,
                        double y,
                        double step_size,
                        bool apply_regularization) {

    sgd_processing_buffer& buffer = buffers[thread_idx];

    static constexpr size_t ITEM_COLUMN_INDEX = 1;

    auto ru = _apply_regularization_update(step_size, apply_regularization);

    // Hold the item's lock while its parameters are read and updated.
    std::unique_lock<simple_spinlock> item_lock(item_locks[x[ITEM_COLUMN_INDEX].index], std::defer_lock);

    if(enable_item_locking)
      item_lock.lock();

    double fx_value = _fill_buffer_calc_value(buffer, x, ru.s_w_old, ru.s_V_old);

    double l_grad = loss_model.loss_grad(fx_value, y);

    _apply_w0_gradient(buffer, l_grad, step_size);

    const size_t x_size = x.size();

    _apply_w_V_gradient(buffer, l_grad, ru.s_w_new_inv, ru.s_V_new_inv, x_size, step_size);

    if(enable_item_locking)
      item_lock.unlock();

    // Keep the compiler from reordering the updates past this point.
    asm volatile("" ::: "memory");

    // Report the loss at the pre-update parameter values.
    double loss_value = loss_model.loss(fx_value, y);

    return loss_value;
  }

  GL_HOT_INLINE_FLATTEN
  double apply_sgd_step(size_t thread_idx,
                        const std::vector<v2::ml_data_entry>& x,
                        double y,
                        double step_size) {
    return apply_sgd_step(thread_idx, x, y, step_size, true);
  }
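  // A rough sketch of how an SGD solver is expected to drive this interface
  // (hypothetical driver loop, simplified; the real one lives with
  // sgd::sgd_interface_base):
  //
  //   iface.setup(train_data, options);
  //   iface.setup_optimization(random_seed);
  //   for(size_t it = 0; it < max_iterations; ++it) {
  //     iface.setup_iteration(it, step_size);
  //     // ... one (possibly parallel) pass over the shuffled data:
  //     //     loss += iface.apply_sgd_step(thread_idx, x, y, step_size);
  //     iface.finalize_iteration();
  //   }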
  ////////////////////////////////////////////////////////////////////////////
  // Apply a pairwise (ranking) SGD step: push the positive example's score
  // above the negative example's score.

  double apply_pairwise_sgd_step(
      size_t thread_idx,
      const std::vector<v2::ml_data_entry>& x_positive,
      const std::vector<v2::ml_data_entry>& x_negative,
      double step_size) GL_HOT_FLATTEN {

    sgd_processing_buffer& buffer_1 = buffers[thread_idx];
    sgd_processing_buffer& buffer_2 = alt_buffers[thread_idx];

    DASSERT_GE(buffer_1.v.size(), x_positive.size());
    DASSERT_GE(buffer_2.v.size(), x_negative.size());

    // Same user, different items.
    DASSERT_EQ(x_positive[0].index, x_negative[0].index);
    DASSERT_NE(x_positive[1].index, x_negative[1].index);

    auto s = _apply_regularization_update(step_size);

    double fx_diff_value = (_fill_buffer_calc_value(buffer_1, x_positive, s.s_w_old, s.s_V_old)
                            - _fill_buffer_calc_value(buffer_2, x_negative, s.s_w_old, s.s_V_old));

    double l_grad = loss_model.loss_grad(fx_diff_value, 0);

    if(! (std::fabs(l_grad) < 1e-16) ) {

      _apply_w_V_gradient(buffer_1, l_grad, s.s_w_new_inv, s.s_V_new_inv,
                          x_positive.size(), step_size);

      _apply_w_V_gradient(buffer_2, -l_grad, s.s_w_new_inv, s.s_V_new_inv,
                          x_positive.size(), step_size);
    }

    return loss_model.loss(fx_diff_value, 0);
  }
};

}}

#endif