#include "GradientOptimizer.h"
00045 
#include <plearn/math/TMat_maths.h>
00046 
#include <plearn/display/DisplayUtils.h>
00047 
#include <plearn/var/SumOfVariable.h>
00048 
00049 
namespace PLearn {
00050 
using namespace std;
00051 

GradientOptimizer::GradientOptimizer(real the_start_learning_rate, 
                                     real the_decrease_constant,
                                     int n_updates, const string& filename, 
                                     int every_iterations)
  : inherited(n_updates, filename, every_iterations),
    start_learning_rate(the_start_learning_rate),
    decrease_constant(the_decrease_constant) {}

GradientOptimizer::GradientOptimizer(VarArray the_params, Var the_cost,
                                     real the_start_learning_rate, 
                                     real the_decrease_constant,
                                     int n_updates, const string& filename, 
                                     int every_iterations)
  : inherited(the_params, the_cost, n_updates, filename, every_iterations),
    start_learning_rate(the_start_learning_rate),
    decrease_constant(the_decrease_constant) {}

GradientOptimizer::GradientOptimizer(VarArray the_params, Var the_cost, 
                                     VarArray update_for_measure,
                                     real the_start_learning_rate, 
                                     real the_decrease_constant,
                                     int n_updates, const string& filename, 
                                     int every_iterations)
  : inherited(the_params, the_cost, update_for_measure,
              n_updates, filename, every_iterations),
    start_learning_rate(the_start_learning_rate),
    decrease_constant(the_decrease_constant) {}

void GradientOptimizer::declareOptions(OptionList& ol)
{
    declareOption(ol, "start_learning_rate", &GradientOptimizer::start_learning_rate, OptionBase::buildoption, 
                  "    the initial learning rate\n");

    declareOption(ol, "learning_rate", &GradientOptimizer::learning_rate, OptionBase::learntoption, 
                  "    the current learning rate\n");

    declareOption(ol, "decrease_constant", &GradientOptimizer::decrease_constant, OptionBase::buildoption, 
                  "    the learning rate decrease constant\n");

    declareOption(ol, "lr_schedule", &GradientOptimizer::lr_schedule, OptionBase::buildoption, 
                  "Fixed schedule instead of decrease_constant. This matrix has 2 columns: iteration_threshold\n"
                  "and learning_rate_factor. As soon as the iteration number goes above the iteration_threshold,\n"
                  "the corresponding learning_rate_factor is applied (multiplied) to the start_learning_rate to\n"
                  "obtain the learning_rate.\n");

    inherited::declareOptions(ol);
}
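
// Illustrative example (not from the original source) of how lr_schedule is
// interpreted by optimizeN() below, assuming the documented 2-column layout
// (iteration_threshold, learning_rate_factor):
//
//     lr_schedule = [ 10000   1.0
//                     50000   0.1 ]
//
// With start_learning_rate = 0.01, stages up to 10000 use
// learning_rate = 0.01 * 1.0; once the stage goes above 10000 the schedule
// advances to the next row and learning_rate = 0.01 * 0.1 is used.
// When no schedule is given, the decayed rate
// learning_rate = start_learning_rate / (1 + decrease_constant * stage)
// applies instead.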

PLEARN_IMPLEMENT_OBJECT(
    GradientOptimizer, 
    "Optimization by gradient descent.", 
    "GradientOptimizer is the simple usual gradient descent algorithm \n"
    " (the number of samples on which to estimate gradients before an \n"
    "  update, which determines whether we are performing 'batch', \n"
    "  'stochastic' or even 'minibatch' descent, is currently specified \n"
    "  outside this class, typically in the meanOf function to be \n"
    "  optimized, via its 'nsamples' parameter). \n"
    "Options for GradientOptimizer are [ option_name: <type> (default) ]: \n"
    "  - start_learning_rate: <real> (0.01) \n"
    "    the initial learning rate \n"
    "  - decrease_constant: <real> (0) \n"
    "    the learning rate decrease constant \n"
    "\n"
    "GradientOptimizer derives from Optimizer. \n");
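
// Hypothetical configuration sketch (the option names match those declared
// above, but the surrounding script syntax, the values, and the 'nstages'
// option assumed to be inherited from Optimizer are illustrative only):
//
//   GradientOptimizer(
//       start_learning_rate = 0.01;
//       decrease_constant = 1e-6;
//       nstages = 10000;
//   )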

// Set displayvg to true to dump the propagation path with displayVarGraph()
// from optimize() (it is also dumped when the cost value becomes non-finite).
static bool displayvg = false;

real GradientOptimizer::optimize()
{
  ofstream out;
  if (!filename.empty())
    {
     out.open(filename.c_str());
     out << " Stochastic! " << endl;
    }
  Vec meancost(cost->size());
  TVec<int> costnonmissing(cost->size());
  Vec lastmeancost(cost->size());
  early_stop = false;
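
  // Stochastic hack: when the cost is a SumOfVariable with nsamples == 1
  // (pure stochastic gradient), each parameter's gradient storage is
  // temporarily redirected onto its value storage. Since the cost gradient
  // is seeded with -learning_rate, fbprop() then accumulates
  // -learning_rate * gradient directly into the parameter values, so no
  // separate updateAndClear() is needed inside the loop. The original
  // gradient locations are restored after the loop.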
  SumOfVariable* sumofvar = dynamic_cast<SumOfVariable*>((Variable*)cost);
  Array<Mat> oldgradientlocations;
  bool stochastic_hack = sumofvar!=0 && sumofvar->nsamples==1;
  if(stochastic_hack)
  {
    int n = params.size();
    oldgradientlocations.resize(n);
    for(int i=0; i<n; i++)
      oldgradientlocations[i] = params[i]->defineGradientLocation(params[i]->matValue);
  }
  else
    params.clearGradient();

  for (int t=0; !early_stop && t<nupdates; t++)
  {
    // Decayed learning rate: start_learning_rate / (1 + decrease_constant*t).
    learning_rate = start_learning_rate/(1.0+decrease_constant*t);

    proppath.clearGradient();
    cost->gradient[0] = -learning_rate;

    proppath.fbprop();
    if (displayvg || !finite(cost->value[0]))
      displayVarGraph(proppath, true, 333);
    addIfNonMissing(cost->value, costnonmissing, meancost);
    if ((every!=0) && ((t+1)%every==0))
    {
      // Report the mean cost (over non-missing values) since the last report.
      for (int i=0; i<cost->size(); i++)
        meancost[i] /= costnonmissing[i];
      cout << t+1 << ' ' << meancost << ' ' << learning_rate << endl;
      if (out)
        out << t+1 << ' ' << meancost << ' ' << learning_rate << endl;
      early_stop = measure(t+1, meancost);
      early_stop_i = (t+1)/every;
      lastmeancost << meancost;
      meancost.clear();
      costnonmissing.clear();
    }
    if(!stochastic_hack)
      params.updateAndClear();
  }

  // Restore the original gradient storage if it was redirected above.
  if(stochastic_hack)
    {
      int n = params.size();
      for(int i=0; i<n; i++)
        params[i]->defineGradientLocation(oldgradientlocations[i]);
    }

  return lastmeancost[0];
}

bool GradientOptimizer::optimizeN(VecStatsCollector& stats_coll) 
{
  // Same stochastic hack as in optimize(): with a SumOfVariable cost and
  // nsamples == 1, parameter gradients are accumulated directly into the
  // parameter values during fbprop().
  SumOfVariable* sumofvar = dynamic_cast<SumOfVariable*>((Variable*)cost);
  Array<Mat> oldgradientlocations;
  bool stochastic_hack = sumofvar!=0 && sumofvar->nsamples==1;
  if(stochastic_hack)
    {
      int n = params.size();
      oldgradientlocations.resize(n);
      for(int i=0; i<n; i++)
        oldgradientlocations[i] = params[i]->defineGradientLocation(params[i]->matValue);
    }
  else
    params.clearGradient();

  int stage_max = stage + nstages;

  // Position the schedule index for the current stage, if a fixed lr_schedule
  // was given.
  int current_schedule = 0;
  int n_schedules = lr_schedule.length();
  if (n_schedules>0)
    while (current_schedule+1 < n_schedules && stage > lr_schedule(current_schedule,0)) current_schedule++;
  while (stage < stage_max) 
    {
      if (n_schedules>0)
        {
          while (current_schedule+1 < n_schedules && stage > lr_schedule(current_schedule,0)) current_schedule++;
          learning_rate = start_learning_rate * lr_schedule(current_schedule,1);
        }
      else
        learning_rate = start_learning_rate/(1.0+decrease_constant*stage);
      proppath.clearGradient();
      cost->gradient[0] = -learning_rate;
      proppath.fbprop();

#ifdef BOUNDCHECK
      int np = params.size();
      for(int i=0; i<np; i++)
        if (params[i]->value.hasMissing())
          PLERROR("parameter updated with NaN");
#endif

      static bool display_var_graph = false;
      if (display_var_graph)
        displayVarGraph(proppath, true, 333);

      if(!stochastic_hack)
        params.updateAndClear();

      stats_coll.update(cost->value);
      ++stage;
    }

  // Restore the original gradient storage if it was redirected above.
  if(stochastic_hack) 
    {
      int n = params.size();
      for(int i=0; i<n; i++)
        params[i]->defineGradientLocation(oldgradientlocations[i]);
    }

  return false;
}

real ScaledGradientOptimizer::optimize()
{
  ofstream out;
  if (!filename.empty())
    out.open(filename.c_str());

  eps_scale.fill(1.0);
  Vec first_long_time_mv;
  real best_cost = 1e30;
  Vec prev_params(gradient.length());
  Vec prev_gradient(gradient.length());
  Vec best_params(gradient.length());
  Vec best_gradient(gradient.length());
  params >> prev_params;
  params >> best_params;
  params.copyGradientTo(prev_gradient);
  params.copyGradientTo(best_gradient);
  int n_long = (int)(1.0/(short_time_mac*long_time_mac));
  cout << "start learning rate = " << start_learning_rate << endl;
  learning_rate = 0;
  Vec meancost(cost->size());
  Vec lastmeancost(cost->size());
  early_stop = false;
  for (int t=0; !early_stop && t<nupdates; t++)
  {
    params.clearGradient();
    proppath.clearGradient();
    cost->gradient[0] = 1.0;
    proppath.fbprop();
    if (every!=0) 
    {
      if ((t%every==0) && (t>0)) 
      {
        meancost /= real(every);
        if (meancost[0] > best_cost)
        {
          // The cost got worse: halve the learning rate and go back to the
          // best parameters seen so far.
          start_learning_rate *= 0.5;
          params << best_params;
          params.copyGradientFrom(best_gradient);
        }
        else
        {
          // The cost improved: remember these parameters and slightly
          // increase the learning rate.
          best_cost = meancost[0];
          best_params << prev_params;
          best_gradient << prev_gradient;
          params >> prev_params;
          params.copyGradientTo(prev_gradient);
          start_learning_rate *= 1.1;
        }
        learning_rate = start_learning_rate/(1.0+decrease_constant*t);
        cout << t << ' ' << meancost << ' ' << learning_rate << endl;
        if (out)
          out << t << ' ' << meancost << ' ' << learning_rate << endl;
        early_stop = measure(t, meancost);
        lastmeancost << meancost;
        meancost.clear();
      }
      else
      {
        learning_rate = start_learning_rate/(1.0+decrease_constant*t);
      }
    }
    params.copyGradientTo(gradient);
    if (t<n_long-1)
    {
      // Accumulate the gradient and its square to initialize the long-term
      // moving average and variance.
      long_time_ma += gradient;
      squareAcc(long_time_mv, gradient);
    }
    else if (t==n_long-1) 
    {
      // Finish the initialization: turn the sums into a mean and a variance.
      long_time_ma *= real(1.0)/(real)n_long;
      long_time_mv *= real(1.0)/(real)n_long;
      squareMultiplyAcc(long_time_mv, long_time_ma, (real)-1);
      first_long_time_mv << long_time_mv;
      short_time_ma << long_time_ma;
    }
    else 
    {
      // Exponential moving averages of the gradient (short and long term)
      // and of its second moment.
      exponentialMovingAverageUpdate(short_time_ma, gradient, short_time_mac);
      exponentialMovingAverageUpdate(long_time_ma, short_time_ma, long_time_mac);
      exponentialMovingSquareUpdate(long_time_mv, gradient, long_time_mac);
      if (t%n_long==0)
      {
        real prev_eps = 0.5*(max(eps_scale)+mean(eps_scale));
        cout << "******* AT T= " << t << " *******" << endl;
        cout << "average gradient norm = " << norm(long_time_ma) << endl;
        cout << "average gradient = " << long_time_ma << endl;
        Vec long_time_md = sqrt(long_time_mv);
        cout << "sdev(gradient) = " << long_time_md << endl;
        cout << "mean(sdev(gradient)) = " << mean(long_time_md) << endl;
        // Per-parameter scaling factors: gradient variance plus regularizer.
        add(long_time_mv, regularizer, eps_scale);
        cout << "eps_scale = " << eps_scale << endl;
        real new_eps = 0.5*(max(eps_scale)+mean(eps_scale));
        // Compensate the learning rate for the change in the average scale.
        start_learning_rate *= prev_eps / new_eps;
        learning_rate = start_learning_rate / (1 + decrease_constant*t);
        cout << "scale learning rate by " << prev_eps / new_eps << " to " << learning_rate << endl;
      }
    }
    meancost += cost->value;
    // Scaled gradient step.
    gradient *= eps_scale;
    params.update(-learning_rate, gradient);
  }
  return meancost[0];
}

} // namespace PLearn