#include "GradientOptimizer.h"
00045
#include <plearn/math/TMat_maths.h>
00046
#include <plearn/display/DisplayUtils.h>
00047
#include <plearn/var/SumOfVariable.h>
00048
00049
namespace PLearn {
00050
using namespace std;
00051

GradientOptimizer::GradientOptimizer(real the_start_learning_rate,
                                     real the_decrease_constant,
                                     int n_updates, const string& filename,
                                     int every_iterations)
    : inherited(n_updates, filename, every_iterations),
      start_learning_rate(the_start_learning_rate),
      decrease_constant(the_decrease_constant) {}

GradientOptimizer::GradientOptimizer(VarArray the_params, Var the_cost,
                                     real the_start_learning_rate,
                                     real the_decrease_constant,
                                     int n_updates, const string& filename,
                                     int every_iterations)
    : inherited(the_params, the_cost, n_updates, filename, every_iterations),
      start_learning_rate(the_start_learning_rate),
      decrease_constant(the_decrease_constant) {}

GradientOptimizer::GradientOptimizer(VarArray the_params, Var the_cost,
                                     VarArray update_for_measure,
                                     real the_start_learning_rate,
                                     real the_decrease_constant,
                                     int n_updates, const string& filename,
                                     int every_iterations)
    : inherited(the_params, the_cost, update_for_measure,
                n_updates, filename, every_iterations),
      start_learning_rate(the_start_learning_rate),
      decrease_constant(the_decrease_constant) {}

void GradientOptimizer::declareOptions(OptionList& ol)
{
    declareOption(ol, "start_learning_rate", &GradientOptimizer::start_learning_rate,
                  OptionBase::buildoption,
                  "The initial learning rate.\n");

    declareOption(ol, "learning_rate", &GradientOptimizer::learning_rate,
                  OptionBase::learntoption,
                  "The current learning rate.\n");

    declareOption(ol, "decrease_constant", &GradientOptimizer::decrease_constant,
                  OptionBase::buildoption,
                  "The learning rate decrease constant.\n");

    declareOption(ol, "lr_schedule", &GradientOptimizer::lr_schedule,
                  OptionBase::buildoption,
                  "Fixed schedule instead of decrease_constant. This matrix has 2 columns: iteration_threshold\n"
                  "and learning_rate_factor. As soon as the iteration number goes above the iteration_threshold,\n"
                  "the corresponding learning_rate_factor is applied (multiplied) to the start_learning_rate to\n"
                  "obtain the learning_rate.\n");

    inherited::declareOptions(ol);
}
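
// Illustrative note (added commentary, not part of the original source): a minimal
// sketch of how a hypothetical 'lr_schedule' is interpreted by optimizeN() below.
// Suppose the matrix has rows (iteration_threshold, learning_rate_factor)
//
//      1000   1.0
//      5000   0.5
//     10000   0.1
//
// and start_learning_rate = 0.01. Stages up to 1000 use 0.01 * 1.0 = 0.01; once the
// stage exceeds 1000 the lookup advances to the next row, giving 0.01 * 0.5 = 0.005
// up to stage 5000, and 0.01 * 0.1 = 0.001 after that (the last row's factor keeps
// being used once every threshold has been passed).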

PLEARN_IMPLEMENT_OBJECT(
    GradientOptimizer,
    "Optimization by gradient descent.",
    "GradientOptimizer is the simple usual gradient descent algorithm \n"
    "(the number of samples on which to estimate gradients before an \n"
    "update, which determines whether we are performing 'batch', \n"
    "'stochastic' or even 'minibatch', is currently specified outside \n"
    "this class, typically in the number of samples of the meanOf function \n"
    "to be optimized, as its 'nsamples' parameter). \n"
    "Options for GradientOptimizer are [ option_name: <type> (default) ]: \n"
    " - start_learning_rate: <real> (0.01) \n"
    "   the initial learning rate \n"
    " - decrease_constant: <real> (0) \n"
    "   the learning rate decrease constant \n"
    "\n"
    "GradientOptimizer derives from Optimizer. \n");
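
// Illustrative note (added commentary, not part of the original source): when no
// lr_schedule is given, the learning rate used at update t is
//
//     learning_rate = start_learning_rate / (1 + decrease_constant * t)
//
// For example, with the hypothetical values start_learning_rate = 0.01 and
// decrease_constant = 1e-3, this gives 0.01 at t = 0, 0.005 at t = 1000 and
// 0.001 at t = 9000.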

static bool displayvg = false;

real GradientOptimizer::optimize()
{
    ofstream out;
    if (!filename.empty())
    {
        out.open(filename.c_str());
        out << " Stochastic! " << endl;
    }
    Vec meancost(cost->size());
    TVec<int> costnonmissing(cost->size());
    Vec lastmeancost(cost->size());
    early_stop = false;

    SumOfVariable* sumofvar = dynamic_cast<SumOfVariable*>((Variable*)cost);
    Array<Mat> oldgradientlocations;
    bool stochastic_hack = sumofvar!=0 && sumofvar->nsamples==1;
    if (stochastic_hack)
    {
        int n = params.size();
        oldgradientlocations.resize(n);
        for (int i=0; i<n; i++)
            oldgradientlocations[i] = params[i]->defineGradientLocation(params[i]->matValue);
    }
    else
        params.clearGradient();

    for (int t=0; !early_stop && t<nupdates; t++)
    {
        learning_rate = start_learning_rate/(1.0+decrease_constant*t);
        proppath.clearGradient();
        cost->gradient[0] = -learning_rate;
        proppath.fbprop();
        if (displayvg || !finite(cost->value[0]))
            displayVarGraph(proppath, true, 333);
        addIfNonMissing(cost->value, costnonmissing, meancost);
        if ((every!=0) && ((t+1)%every==0))
        {
            for (int i=0; i<cost->size(); i++)
                meancost[i] /= costnonmissing[i];
            cout << t+1 << ' ' << meancost << ' ' << learning_rate << endl;
            if (out)
                out << t+1 << ' ' << meancost << ' ' << learning_rate << endl;
            early_stop = measure(t+1, meancost);
            early_stop_i = (t+1)/every;
            lastmeancost << meancost;
            meancost.clear();
            costnonmissing.clear();
        }
        if (!stochastic_hack)
            params.updateAndClear();
    }

    if (stochastic_hack)
    {
        int n = params.size();
        for (int i=0; i<n; i++)
            params[i]->defineGradientLocation(oldgradientlocations[i]);
    }

    return lastmeancost[0];
}
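
// Added commentary (not in the original source) on the 'stochastic_hack' used in
// optimize() above and in optimizeN() below: when the cost is a SumOfVariable with
// nsamples == 1 (pure stochastic gradient), each parameter's gradient storage is
// temporarily redirected onto its value storage through
// defineGradientLocation(matValue). Because the cost gradient is seeded with
// -learning_rate, the gradient accumulation performed by fbprop() then writes the
// update directly into the parameter values, so the explicit params.updateAndClear()
// step can be skipped; the original gradient locations are restored once the loop
// is done.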

bool GradientOptimizer::optimizeN(VecStatsCollector& stats_coll)
{
    SumOfVariable* sumofvar = dynamic_cast<SumOfVariable*>((Variable*)cost);
    Array<Mat> oldgradientlocations;
    bool stochastic_hack = sumofvar!=0 && sumofvar->nsamples==1;
    if (stochastic_hack)
    {
        int n = params.size();
        oldgradientlocations.resize(n);
        for (int i=0; i<n; i++)
            oldgradientlocations[i] = params[i]->defineGradientLocation(params[i]->matValue);
    }
    else
        params.clearGradient();

    int stage_max = stage + nstages;

    int current_schedule = 0;
    int n_schedules = lr_schedule.length();
    if (n_schedules>0)
        while (current_schedule+1 < n_schedules && stage > lr_schedule(current_schedule,0))
            current_schedule++;
    while (stage < stage_max)
    {
        if (n_schedules>0)
        {
            while (current_schedule+1 < n_schedules && stage > lr_schedule(current_schedule,0))
                current_schedule++;
            learning_rate = start_learning_rate * lr_schedule(current_schedule,1);
        }
        else
            learning_rate = start_learning_rate/(1.0+decrease_constant*stage);
        proppath.clearGradient();
        cost->gradient[0] = -learning_rate;
        proppath.fbprop();
#ifdef BOUNDCHECK
        int np = params.size();
        for (int i=0; i<np; i++)
            if (params[i]->value.hasMissing())
                PLERROR("parameter updated with NaN");
#endif
        static bool display_var_graph = false;
        if (display_var_graph)
            displayVarGraph(proppath, true, 333);

        if (!stochastic_hack)
            params.updateAndClear();

        stats_coll.update(cost->value);
        ++stage;
    }

    if (stochastic_hack)
    {
        int n = params.size();
        for (int i=0; i<n; i++)
            params[i]->defineGradientLocation(oldgradientlocations[i]);
    }

    return false;
}
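
// Added commentary (not in the original source): unlike optimize(), optimizeN()
// above performs exactly nstages updates starting from the current 'stage', feeds
// every per-update cost into the VecStatsCollector instead of printing averaged
// costs, and always returns false, apparently meaning that no early-stopping
// condition is detected inside this method itself.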

real ScaledGradientOptimizer::optimize()
{
    ofstream out;
    if (!filename.empty())
        out.open(filename.c_str());

    eps_scale.fill(1.0);
    Vec first_long_time_mv;
    real best_cost = 1e30;
    Vec prev_params(gradient.length());
    Vec prev_gradient(gradient.length());
    Vec best_params(gradient.length());
    Vec best_gradient(gradient.length());
    params >> prev_params;
    params >> best_params;
    params.copyGradientTo(prev_gradient);
    params.copyGradientTo(best_gradient);
    int n_long = (int)(1.0/(short_time_mac*long_time_mac));
    cout << "start learning rate = " << start_learning_rate << endl;
    learning_rate = 0;
    Vec meancost(cost->size());
    Vec lastmeancost(cost->size());
    early_stop = false;
    for (int t=0; !early_stop && t<nupdates; t++)
    {
        params.clearGradient();
        proppath.clearGradient();
        cost->gradient[0] = 1.0;
        proppath.fbprop();
        if (every!=0)
        {
            if ((t%every==0) && (t>0))
            {
                meancost /= real(every);
                if (meancost[0] > best_cost)
                {
                    start_learning_rate *= 0.5;
                    params << best_params;
                    params.copyGradientFrom(best_gradient);
                }
                else
                {
                    best_cost = meancost[0];
                    best_params << prev_params;
                    best_gradient << prev_gradient;
                    params >> prev_params;
                    params.copyGradientTo(prev_gradient);
                    start_learning_rate *= 1.1;
                }
                learning_rate = start_learning_rate/(1.0+decrease_constant*t);
                cout << t << ' ' << meancost << ' ' << learning_rate << endl;
                if (out)
                    out << t << ' ' << meancost << ' ' << learning_rate << endl;
                early_stop = measure(t, meancost);
                lastmeancost << meancost;
                meancost.clear();
            }
            else
            {
                learning_rate = start_learning_rate/(1.0+decrease_constant*t);
            }
        }
        params.copyGradientTo(gradient);
        if (t<n_long-1)
        {
            long_time_ma += gradient;
            squareAcc(long_time_mv, gradient);
        }
        else if (t==n_long-1)
        {
            long_time_ma *= real(1.0)/(real)n_long;
            long_time_mv *= real(1.0)/(real)n_long;
            squareMultiplyAcc(long_time_mv, long_time_ma, (real)-1);
            first_long_time_mv << long_time_mv;
            short_time_ma << long_time_ma;
        }
        else
        {
            exponentialMovingAverageUpdate(short_time_ma, gradient, short_time_mac);
            exponentialMovingAverageUpdate(long_time_ma, short_time_ma, long_time_mac);
            exponentialMovingSquareUpdate(long_time_mv, gradient, long_time_mac);
            if (t%n_long==0)
            {
                real prev_eps = 0.5*(max(eps_scale)+mean(eps_scale));
                cout << "******* AT T= " << t << " *******" << endl;
                cout << "average gradient norm = " << norm(long_time_ma) << endl;
                cout << "average gradient = " << long_time_ma << endl;
                Vec long_time_md = sqrt(long_time_mv);
                cout << "sdev(gradient) = " << long_time_md << endl;
                cout << "mean(sdev(gradient)) = " << mean(long_time_md) << endl;
                add(long_time_mv, regularizer, eps_scale);
                cout << "eps_scale = " << eps_scale << endl;
                real new_eps = 0.5*(max(eps_scale)+mean(eps_scale));
                start_learning_rate *= prev_eps / new_eps;
                learning_rate = start_learning_rate / (1 + decrease_constant*t);
                cout << "scale learning rate by " << prev_eps / new_eps
                     << " to " << learning_rate << endl;
            }
        }
        meancost += cost->value;
        gradient *= eps_scale;
        params.update(-learning_rate, gradient);
    }
    return meancost[0];
}
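
// Added commentary (not in the original source): every 'every' updates,
// ScaledGradientOptimizer::optimize() above compares the current mean cost with the
// best one seen so far; if it got worse, start_learning_rate is halved and the best
// parameters and gradient are restored, otherwise the new best is recorded and
// start_learning_rate is grown by 10%. Independently, every n_long updates the
// per-parameter scaling vector eps_scale is recomputed as the moving average of the
// squared gradient plus 'regularizer' (the gradient is multiplied element-wise by
// eps_scale before each parameter update), and start_learning_rate is rescaled by
// the ratio of the old to the new characteristic size of eps_scale
// (0.5 * (max + mean)).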

} // namespace PLearn