#include "AdaptGradientOptimizer.h"
00046
#include <plearn/var/SumOfVariable.h>
00047
00048
namespace PLearn {
00049
using namespace std;
00050

AdaptGradientOptimizer::AdaptGradientOptimizer(
    real the_start_learning_rate,
    real the_decrease_constant,
    real the_min_learning_rate,
    real the_max_learning_rate,
    int the_learning_rate_adaptation,
    real the_adapt_coeff1,
    real the_adapt_coeff2,
    int n_updates, const string& filename,
    int every_iterations)
    : inherited(n_updates, filename, every_iterations),
      start_learning_rate(the_start_learning_rate),
      min_learning_rate(the_min_learning_rate),
      max_learning_rate(the_max_learning_rate),
      learning_rate_adaptation(the_learning_rate_adaptation),
      adapt_coeff1(the_adapt_coeff1),
      adapt_coeff2(the_adapt_coeff2),
      decrease_constant(the_decrease_constant),
      adapt_every(0)
{}

AdaptGradientOptimizer::AdaptGradientOptimizer(
    VarArray the_params, Var the_cost,
    real the_start_learning_rate,
    real the_decrease_constant,
    real the_min_learning_rate,
    real the_max_learning_rate,
    int the_learning_rate_adaptation,
    real the_adapt_coeff1,
    real the_adapt_coeff2,
    int n_updates, const string& filename,
    int every_iterations)
    : inherited(the_params, the_cost, n_updates, filename, every_iterations),
      start_learning_rate(the_start_learning_rate),
      min_learning_rate(the_min_learning_rate),
      max_learning_rate(the_max_learning_rate),
      learning_rate_adaptation(the_learning_rate_adaptation),
      adapt_coeff1(the_adapt_coeff1),
      adapt_coeff2(the_adapt_coeff2),
      decrease_constant(the_decrease_constant),
      adapt_every(0) // also initialized here so that it is never read uninitialized
{}

AdaptGradientOptimizer::AdaptGradientOptimizer(
    VarArray the_params, Var the_cost,
    VarArray update_for_measure,
    real the_start_learning_rate,
    real the_decrease_constant,
    real the_min_learning_rate,
    real the_max_learning_rate,
    int the_learning_rate_adaptation,
    real the_adapt_coeff1,
    real the_adapt_coeff2,
    int n_updates, const string& filename,
    int every_iterations)
    : inherited(the_params, the_cost, update_for_measure,
                n_updates, filename, every_iterations),
      start_learning_rate(the_start_learning_rate),
      min_learning_rate(the_min_learning_rate),
      max_learning_rate(the_max_learning_rate),
      learning_rate_adaptation(the_learning_rate_adaptation),
      adapt_coeff1(the_adapt_coeff1),
      adapt_coeff2(the_adapt_coeff2),
      decrease_constant(the_decrease_constant),
      adapt_every(0) // also initialized here so that it is never read uninitialized
{}

void AdaptGradientOptimizer::declareOptions(OptionList& ol)
{
    declareOption(ol, "start_learning_rate",
                  &AdaptGradientOptimizer::start_learning_rate, OptionBase::buildoption,
                  "    the initial learning rate\n");

    declareOption(ol, "min_learning_rate",
                  &AdaptGradientOptimizer::min_learning_rate, OptionBase::buildoption,
                  "    the minimum value of the learning rate, when there is learning rate adaptation\n");

    declareOption(ol, "max_learning_rate",
                  &AdaptGradientOptimizer::max_learning_rate, OptionBase::buildoption,
                  "    the maximum value of the learning rate, when there is learning rate adaptation\n");

    declareOption(ol, "adapt_coeff1",
                  &AdaptGradientOptimizer::adapt_coeff1, OptionBase::buildoption,
                  "    a coefficient for learning rate adaptation; its meaning depends on the kind of adaptation\n");

    declareOption(ol, "adapt_coeff2",
                  &AdaptGradientOptimizer::adapt_coeff2, OptionBase::buildoption,
                  "    a coefficient for learning rate adaptation; its meaning depends on the kind of adaptation\n");

    declareOption(ol, "decrease_constant",
                  &AdaptGradientOptimizer::decrease_constant, OptionBase::buildoption,
                  "    the learning rate decrease constant: each update of the weights is scaled by the\n"
                  "    coefficient 1/(1 + stage * decrease_constant)\n");

    declareOption(ol, "learning_rate_adaptation",
                  &AdaptGradientOptimizer::learning_rate_adaptation, OptionBase::buildoption,
                  "    the way the learning rates evolve:\n"
                  "    - 0: no adaptation\n"
                  "    - 1: basic adaptation:\n"
                  "         if the gradient of weight i has kept the same sign for two consecutive epochs,\n"
                  "         then lr(i) = lr(i) + lr(i) * adapt_coeff1,\n"
                  "         else lr(i) = lr(i) - lr(i) * adapt_coeff2\n"
                  "    - 2: ALAP1 formula; see the code (not really tested)\n"
                  "    - 3: variance-dependent learning rate:\n"
                  "         let avg(i) be the exponential average of the variance of the gradient of weight i\n"
                  "         over the past epochs, where the coefficient of the exponential average is adapt_coeff1\n"
                  "         (adapt_coeff1 = 0 means no averaging);\n"
                  "         if avg(i) is low (i.e. below the average of all avg(j)),\n"
                  "         then lr(i) = max_learning_rate,\n"
                  "         else lr(i) = min_learning_rate\n");

    declareOption(ol, "adapt_every",
                  &AdaptGradientOptimizer::adapt_every, OptionBase::buildoption,
                  "    the learning rate adaptation will occur after adapt_every updates of the weights\n"
                  "    (0 means after each epoch)\n");

    inherited::declareOptions(ol);
}

PLEARN_IMPLEMENT_OBJECT(
    AdaptGradientOptimizer,
    "An optimizer that performs gradient descent with learning rate adaptation.",
    ""
);
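
// Resizes and clears the buffers used for learning rate adaptation, sets all
// learning rates to start_learning_rate, and detects whether the stochastic
// gradient "hack" applies (the cost is a SumOfVariable over a single sample).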
void AdaptGradientOptimizer::build_()
{
    early_stop = false;
    count_updates = 0;
    learning_rate = start_learning_rate;
    SumOfVariable* sumofvar = dynamic_cast<SumOfVariable*>((Variable*) cost);
    stochastic_hack = sumofvar != 0 && sumofvar->nsamples == 1;
    params.clearGradient();
    int n = params.nelems();
    if (n > 0) {
        store_var_grad.resize(n);
        store_var_grad.clear();
        store_grad.resize(n);
        store_quad_grad.resize(n);
        store_grad.clear();
        store_quad_grad.clear();
        learning_rates.resize(n);
        gradient.resize(n);
        tmp_storage.resize(n);
        old_evol.resize(n);
        oldgradientlocations.resize(params.size());
        learning_rates.fill(start_learning_rate);
        switch (learning_rate_adaptation) {
        case 0:
            break;
        case 1:
            // Basic adaptation: remember the current parameters and reset
            // the evolution of each weight.
            params.copyTo(tmp_storage);
            old_evol.fill(0);
            break;
        case 2:
            // ALAP1 formula: store the current opposite gradient.
            Optimizer::computeOppositeGradient(this, tmp_storage);
            break;
        case 3:
            break;
        default:
            break;
        }
    }
}
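
// ALAP1 adaptation: the global learning rate is incremented by adapt_coeff1
// times the average, over all parameters, of the product of the old and new
// gradients, then clamped to [min_learning_rate, max_learning_rate].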
void AdaptGradientOptimizer::adaptLearningRateALAP1(
    Vec old_gradient,
    Vec new_gradient)
{
    int j = 0;
    real prod = 0;
    for (j = 0; j < params.nelems(); j++) {
        prod += old_gradient[j] * new_gradient[j];
    }
    learning_rate = learning_rate + adapt_coeff1 * prod / real(j);
    if (learning_rate < min_learning_rate) {
        learning_rate = min_learning_rate;
    } else if (learning_rate > max_learning_rate) {
        learning_rate = max_learning_rate;
    }
}
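
// Basic adaptation: for each weight, track the sign of its evolution since the
// last adaptation; a weight that keeps moving in the same direction has its
// learning rate increased by a factor adapt_coeff1, while a weight that
// reverses direction has it decreased by a factor adapt_coeff2, always staying
// within [min_learning_rate, max_learning_rate].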
void AdaptGradientOptimizer::adaptLearningRateBasic(
    Vec old_params,
    Vec old_evol)
{
    Var* array = params->data();
    int j = 0;
    int k;
    real u;
    for (int i = 0; i < params.size(); i++) {
        k = j;
        for (; j < k + array[i]->nelems(); j++) {
            u = old_evol[j];
            real diff = array[i]->valuedata[j-k] - old_params[j];
            if (diff > 0) {
                // The parameter has increased.
                if (u > 0)
                    old_evol[j]++;
                else
                    old_evol[j] = +1;
            } else if (diff < 0) {
                // The parameter has decreased.
                if (u < 0)
                    old_evol[j]--;
                else
                    old_evol[j] = -1;
            } else {
                // The parameter has not changed.
                old_evol[j] = 0;
            }
            if (u * old_evol[j] > 0) {
                // Same direction as before: increase the learning rate.
                learning_rates[j] += learning_rates[j] * adapt_coeff1;
            } else if (u * old_evol[j] < 0) {
                // Opposite direction: decrease the learning rate.
                learning_rates[j] -= learning_rates[j] * adapt_coeff2;
            }
            if (learning_rates[j] < min_learning_rate) {
                learning_rates[j] = min_learning_rate;
            } else if (learning_rates[j] > max_learning_rate) {
                learning_rates[j] = max_learning_rate;
            }
        }
    }
}
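
// Variance-dependent adaptation: maintains an exponential average (with
// coefficient adapt_coeff1) of the variance of each weight's gradient over the
// past epochs; weights whose gradient variance is below the mean variance get
// max_learning_rate, the others get min_learning_rate.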
void AdaptGradientOptimizer::adaptLearningRateVariance()
{
    real moy_var = 0;
    real exp_avg_coeff = 0;
    if (stage > 1) {
        exp_avg_coeff = adapt_coeff1;
    }
    for (int j = 0; j < params.nelems(); j++) {
        // Update the exponential average of the gradient's variance.
        store_var_grad[j] =
            store_var_grad[j] * exp_avg_coeff +
            (store_quad_grad[j] - store_grad[j] * store_grad[j] / real(count_updates))
                * (1 - exp_avg_coeff);
        moy_var += store_var_grad[j];
    }
    count_updates = 0;
    store_quad_grad.clear();
    store_grad.clear();
    moy_var /= real(params.nelems());
    int nb_low_var = 0, nb_high_var = 0;
    real var_limit = 1.0;
    for (int j = 0; j < params.nelems(); j++) {
        if (store_var_grad[j] <= moy_var * var_limit) {
            learning_rates[j] = max_learning_rate;
            nb_low_var++;
        } else {
            learning_rates[j] = min_learning_rate;
            nb_high_var++;
        }
    }
}
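
// Deprecated entry point, kept only for interface compatibility: it always
// raises an error, since optimization must go through optimizeN.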
real AdaptGradientOptimizer::optimize()
{
    PLERROR("AdaptGradientOptimizer::optimize is deprecated, use optimizeN instead!");
    return 0;
}
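
// Performs nstages optimization steps, adapting the learning rates every
// adapt_every updates according to the chosen adaptation scheme; returns
// whether early stopping occurred.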
bool AdaptGradientOptimizer::optimizeN(VecStatsCollector& stats_coll)
{
    bool adapt = (learning_rate_adaptation != 0);
    stochastic_hack = stochastic_hack && !adapt;
    if (adapt_every == 0) {
        // Adapt the learning rates after each epoch.
        adapt_every = nstages;
    }

    // Stochastic hack: make the gradient fields of the parameters point to
    // the parameter values themselves, so that each backprop accumulates the
    // updates directly into the weights and no explicit update is needed.
    if (stochastic_hack) {
        int n = params.size();
        for (int i = 0; i < n; i++)
            oldgradientlocations[i] = params[i]->defineGradientLocation(params[i]->matValue);
    }

    int stage_max = stage + nstages; // the stage to reach

    for (; !early_stop && stage < stage_max; stage++) {

        // Take the learning rate decrease into account.
        switch (learning_rate_adaptation) {
        case 0:
            learning_rate = start_learning_rate / (1.0 + decrease_constant * stage);
            break;
        default:
            break;
        }

        proppath.clearGradient();
        if (adapt)
            cost->gradient[0] = -1.;
        else
            cost->gradient[0] = -learning_rate;

        proppath.fbprop();

        // Move along the chosen direction and store the data needed by the
        // adaptation scheme.
        real coeff = 1 / (1.0 + stage * decrease_constant);
        switch (learning_rate_adaptation) {
        case 0:
            if (!stochastic_hack) {
                params.updateAndClear();
            }
            break;
        case 1:
            params.copyGradientTo(gradient);
            params.update(learning_rates, gradient, coeff);
            params.clearGradient();
            break;
        case 2:
            params.copyGradientTo(gradient);
            adaptLearningRateALAP1(tmp_storage, gradient);
            params.update(learning_rate, gradient);
            tmp_storage << gradient;
            params.clearGradient();
            break;
        case 3:
            // Store the sum and the sum of squares of the gradient, in
            // order to compute its variance later.
            params.copyGradientTo(gradient);
            for (int i = 0; i < params.nelems(); i++) {
                store_grad[i] += gradient[i];
                store_quad_grad[i] += gradient[i] * gradient[i];
            }
            count_updates++;
            params.update(learning_rates, gradient, coeff);
            params.clearGradient();
            break;
        default:
            break;
        }

        // Perform the learning rate adaptation.
        if ((stage + 1) % adapt_every == 0) {
            switch (learning_rate_adaptation) {
            case 0:
                break;
            case 1:
                adaptLearningRateBasic(tmp_storage, old_evol);
                params.copyTo(tmp_storage);
                break;
            case 2:
                // The learning rate is already adapted after each step.
                break;
            case 3:
                adaptLearningRateVariance();
                break;
            default:
                break;
            }
        }

        stats_coll.update(cost->value);
    }

    // Undo the stochastic hack.
    if (stochastic_hack) {
        int n = params.size();
        for (int i = 0; i < n; i++)
            params[i]->defineGradientLocation(oldgradientlocations[i]);
    }

    if (early_stop)
        cout << "Early Stopping!" << endl;

    return early_stop;
}

} // end of namespace PLearn