
GradientOptimizer.cc

// -*- C++ -*-

// PLearn (A C++ Machine Learning Library)
// Copyright (C) 1998 Pascal Vincent
// Copyright (C) 1999-2002 Pascal Vincent, Yoshua Bengio and University of Montreal
//

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//  1. Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//
//  2. Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//
//  3. The name of the authors may not be used to endorse or promote
//     products derived from this software without specific prior written
//     permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
// NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This file is part of the PLearn library. For more information on the PLearn
// library, go to the PLearn Web site at www.plearn.org


/* *******************************************************
 * $Id: GradientOptimizer.cc,v 1.32 2004/07/21 16:30:54 chrish42 Exp $
 * This file is part of the PLearn library.
 ******************************************************* */

#include "GradientOptimizer.h"
#include <plearn/math/TMat_maths.h>
#include <plearn/display/DisplayUtils.h>
#include <plearn/var/SumOfVariable.h>

namespace PLearn {
using namespace std;

GradientOptimizer::GradientOptimizer(real the_start_learning_rate,
                                     real the_decrease_constant,
                                     int n_updates, const string& filename,
                                     int every_iterations)
    : inherited(n_updates, filename, every_iterations),
      start_learning_rate(the_start_learning_rate),
      decrease_constant(the_decrease_constant) {}

GradientOptimizer::GradientOptimizer(VarArray the_params, Var the_cost,
                                     real the_start_learning_rate,
                                     real the_decrease_constant,
                                     int n_updates, const string& filename,
                                     int every_iterations)
    : inherited(the_params, the_cost, n_updates, filename, every_iterations),
      start_learning_rate(the_start_learning_rate),
      decrease_constant(the_decrease_constant) {}

GradientOptimizer::GradientOptimizer(VarArray the_params, Var the_cost,
                                     VarArray update_for_measure,
                                     real the_start_learning_rate,
                                     real the_decrease_constant,
                                     int n_updates, const string& filename,
                                     int every_iterations)
    : inherited(the_params, the_cost, update_for_measure,
                n_updates, filename, every_iterations),
      start_learning_rate(the_start_learning_rate),
      decrease_constant(the_decrease_constant) {}


void GradientOptimizer::declareOptions(OptionList& ol)
{
    declareOption(ol, "start_learning_rate", &GradientOptimizer::start_learning_rate, OptionBase::buildoption,
                  " the initial learning rate\n");

    declareOption(ol, "learning_rate", &GradientOptimizer::learning_rate, OptionBase::learntoption,
                  " the current learning rate\n");

    declareOption(ol, "decrease_constant", &GradientOptimizer::decrease_constant, OptionBase::buildoption,
                  " the learning rate decrease constant\n");

    declareOption(ol, "lr_schedule", &GradientOptimizer::lr_schedule, OptionBase::buildoption,
                  "Fixed schedule instead of decrease_constant. This matrix has 2 columns: iteration_threshold\n"
                  "and learning_rate_factor. As soon as the iteration number goes above the iteration_threshold,\n"
                  "the corresponding learning_rate_factor is applied (multiplied) to the start_learning_rate to\n"
                  "obtain the learning_rate.\n");

    inherited::declareOptions(ol);
}
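// Example (hypothetical values, for illustration only): with a schedule such as
//
//     lr_schedule = [ 10000   1.0
//                     50000   0.1
//                    100000   0.01 ]
//
// the lookup loop in optimizeN() below uses factor 1.0 while stage <= 10000,
// 0.1 while 10000 < stage <= 50000, and 0.01 once stage > 50000 (the threshold
// of the last row is never compared), giving learning_rate = start_learning_rate
// * factor. With an empty lr_schedule, the decayed rate
// start_learning_rate / (1 + decrease_constant * stage) is used instead.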
/*
void GradientOptimizer::oldwrite(ostream& out) const
{
    writeHeader(out, "GradientOptimizer", 0);
    inherited::write(out);
    writeField(out, "start_learning_rate", start_learning_rate);
    writeField(out, "decrease_constant", decrease_constant);
    writeFooter(out, "GradientOptimizer");
}

void GradientOptimizer::oldread(istream& in)
{
    int ver = readHeader(in, "GradientOptimizer");
    if(ver!=0)
        PLERROR("In GradientOptimizer::read version number %d not supported",ver);
    inherited::oldread(in);
    readField(in, "start_learning_rate", start_learning_rate);
    readField(in, "decrease_constant", decrease_constant);
    readFooter(in, "GradientOptimizer");
}
*/

PLEARN_IMPLEMENT_OBJECT(GradientOptimizer, "Optimization by gradient descent.",
                        "GradientOptimizer is the simple usual gradient descent algorithm \n"
                        " (the number of samples on which to estimate gradients before an \n"
                        " update, which determines whether we are performing 'batch', \n"
                        " 'stochastic' or even 'minibatch' optimization, is currently specified outside \n"
                        " this class, typically in the number of samples of the meanOf function \n"
                        " to be optimized, as its 'nsamples' parameter). \n"
                        "Options for GradientOptimizer are [ option_name: <type> (default) ]: \n"
                        " - start_learning_rate: <real> (0.01) \n"
                        "   the initial learning rate \n"
                        " - decrease_constant: <real> (0) \n"
                        "   the learning rate decrease constant \n"
                        "\n"
                        "GradientOptimizer derives from Optimizer. \n");

static bool displayvg=false;

real GradientOptimizer::optimize()
{
    ofstream out;
    if (!filename.empty())
    {
        out.open(filename.c_str());
        out << " Stochastic! " << endl;
    }
    Vec meancost(cost->size());
    TVec<int> costnonmissing(cost->size());
    Vec lastmeancost(cost->size());
    early_stop = false;

    // Big hack for the special case of stochastic gradient, to avoid doing an explicit update:
    // temporarily change the gradient fields of the parameters to point to the parameters themselves,
    // so that gradients are "accumulated" directly in the parameters, thus updating them.
    SumOfVariable* sumofvar = dynamic_cast<SumOfVariable*>((Variable*)cost);
    Array<Mat> oldgradientlocations;
    // bool stochastic_hack = false;
    bool stochastic_hack = sumofvar!=0 && sumofvar->nsamples==1;
    // stochastic_hack=false;
    if(stochastic_hack)
    {
        int n = params.size();
        oldgradientlocations.resize(n);
        for(int i=0; i<n; i++)
            oldgradientlocations[i] = params[i]->defineGradientLocation(params[i]->matValue);
    }
    else
        params.clearGradient();

    // normally loop over number of epochs x training set size
    for (int t=0; !early_stop && t<nupdates; t++)
    {
        learning_rate = start_learning_rate/(1.0+decrease_constant*t);

        proppath.clearGradient();
        cost->gradient[0] = -learning_rate;

        proppath.fbprop();
        if (displayvg || !finite(cost->value[0]))
            displayVarGraph(proppath, true, 333);
        addIfNonMissing(cost->value,costnonmissing,meancost);
        if ((every!=0) && ((t+1)%every==0))
        // normally this is done every epoch
        {
            //cerr << ">>>>>> nupdates= " << nupdates << " every=" << every << " sumofvar->nsamples=" << sumofvar->nsamples << endl;
            for (int i=0;i<cost->size();i++)
                meancost[i] /= costnonmissing[i];
            //if (decrease_constant != 0)
            //    cout << "at t=" << t << ", learning rate = " << learning_rate << endl;
            cout << t+1 << ' ' << meancost << ' ' << learning_rate << endl;
            if (out)
                out << t+1 << ' ' << meancost << ' ' << learning_rate << endl;
            early_stop = measure(t+1,meancost);
            early_stop_i = (t+1)/every;
            lastmeancost << meancost;
            meancost.clear();
            costnonmissing.clear();
        }
        // set params += -learning_rate * params.gradient
        if(!stochastic_hack)
            params.updateAndClear();
    }

    if(stochastic_hack) // restore the gradients as they previously were...
    {
        int n = params.size();
        for(int i=0; i<n; i++)
            params[i]->defineGradientLocation(oldgradientlocations[i]);
    }

    return lastmeancost[0];
}

bool GradientOptimizer::optimizeN(VecStatsCollector& stats_coll)
{
    // Big hack for the special case of stochastic gradient, to avoid doing an explicit update:
    // temporarily change the gradient fields of the parameters to point to the parameters themselves,
    // so that gradients are "accumulated" directly in the parameters, thus updating them.
    SumOfVariable* sumofvar = dynamic_cast<SumOfVariable*>((Variable*)cost);
    Array<Mat> oldgradientlocations;
    bool stochastic_hack = sumofvar!=0 && sumofvar->nsamples==1;
    //stochastic_hack=false;
    if(stochastic_hack)
    // make the gradient and value fields of the parameters point to the same place,
    // so that when the descendants of the parameter Vars do a bprop this
    // automatically increments the parameters (by the right amount, since
    // we set cost->gradient to -learning_rate).
    {
        int n = params.size();
        oldgradientlocations.resize(n);
        for(int i=0; i<n; i++)
            oldgradientlocations[i] = params[i]->defineGradientLocation(params[i]->matValue);
    }
    else
        params.clearGradient();

    int stage_max = stage + nstages; // the stage to reach

    int current_schedule = 0;
    int n_schedules = lr_schedule.length();
    if (n_schedules>0)
        while (current_schedule+1 < n_schedules && stage > lr_schedule(current_schedule,0))
            current_schedule++;
    while (stage < stage_max)
    {
        if (n_schedules>0)
        {
            while (current_schedule+1 < n_schedules && stage > lr_schedule(current_schedule,0))
                current_schedule++;
            learning_rate = start_learning_rate * lr_schedule(current_schedule,1);
        }
        else
            learning_rate = start_learning_rate/(1.0+decrease_constant*stage);
        proppath.clearGradient();
        cost->gradient[0] = -learning_rate;
        proppath.fbprop();
#ifdef BOUNDCHECK
        int np = params.size();
        for(int i=0; i<np; i++)
            if (params[i]->value.hasMissing())
                PLERROR("parameter updated with NaN");
#endif
        static bool display_var_graph=false;
        if (display_var_graph)
            displayVarGraph(proppath, true, 333);

        // // Debugging of negative NLL bug...
        // if (cost->value[0] <= 0) {
        //     displayVarGraph(proppath, true, 333);
        //     cerr << "Negative NLL cost vector = " << cost << endl;
        //     PLERROR("Negative NLL encountered in optimization");
        // }

        // set params += -learning_rate * params.gradient
        if(!stochastic_hack)
            params.updateAndClear();

        stats_coll.update(cost->value);
        ++stage;
    }

    if(stochastic_hack) // restore the gradients as they previously were...
    {
        int n = params.size();
        for(int i=0; i<n; i++)
            params[i]->defineGradientLocation(oldgradientlocations[i]);
    }

    return false;
}

real ScaledGradientOptimizer::optimize()
{
    ofstream out;
    if (!filename.empty())
        out.open(filename.c_str());

    eps_scale.fill(1.0);
    Vec first_long_time_mv;
    real best_cost = 1e30;
    Vec prev_params(gradient.length());
    Vec prev_gradient(gradient.length());
    Vec best_params(gradient.length());
    Vec best_gradient(gradient.length());
    params >> prev_params;
    params >> best_params;
    params.copyGradientTo(prev_gradient);
    params.copyGradientTo(best_gradient);
    int n_long = (int)(1.0/(short_time_mac*long_time_mac));
    cout << "start learning rate = " << start_learning_rate << endl;
    learning_rate = 0;
    Vec meancost(cost->size());
    Vec lastmeancost(cost->size());
    early_stop = false;
    for (int t=0; !early_stop && t<nupdates; t++)
    {
        params.clearGradient();
        proppath.clearGradient();
        cost->gradient[0] = 1.0;
        proppath.fbprop();
        if (every!=0)
        {
            if ((t%every==0) && (t>0))
            {
                meancost /= real(every);
                if (meancost[0] > best_cost)
                {
                    start_learning_rate *= 0.5;
                    params << best_params;
                    params.copyGradientFrom(best_gradient);
                }
                else
                {
                    best_cost = meancost[0];
                    best_params << prev_params;
                    best_gradient << prev_gradient;
                    params >> prev_params;
                    params.copyGradientTo(prev_gradient);
                    start_learning_rate *= 1.1;
                }
                learning_rate = start_learning_rate/(1.0+decrease_constant*t);
                cout << t << ' ' << meancost << ' ' << learning_rate << endl;
                if (out)
                    out << t << ' ' << meancost << ' ' << learning_rate << endl;
                early_stop = measure(t,meancost);
                lastmeancost << meancost;
                meancost.clear();
            }
            else
            {
                learning_rate = start_learning_rate/(1.0+decrease_constant*t);
            }
        }
        params.copyGradientTo(gradient);
        if (t<n_long-1)
        // prepare to initialize the moving average
        // (by doing initially a batch average)
        {
            long_time_ma += gradient;
            squareAcc(long_time_mv, gradient);
        }
        else if (t==n_long-1)
        // prepare to initialize the moving averages
        {
            long_time_ma *= real(1.0)/(real)n_long;
            long_time_mv *= real(1.0)/(real)n_long;
            squareMultiplyAcc(long_time_mv, long_time_ma, (real)-1);
            first_long_time_mv << long_time_mv;
            short_time_ma << long_time_ma;
        }
        else
        // steady-state mode
        {
            exponentialMovingAverageUpdate(short_time_ma, gradient, short_time_mac);
            exponentialMovingAverageUpdate(long_time_ma, short_time_ma, long_time_mac);
            exponentialMovingSquareUpdate(long_time_mv, gradient, long_time_mac);
            if (t%n_long==0)
            {
                real prev_eps = 0.5*(max(eps_scale)+mean(eps_scale));
                //apply(long_time_mv,long_time_md,sqrt);
                cout << "******* AT T= " << t << " *******" << endl;
                cout << "average gradient norm = "
                     << norm(long_time_ma) << endl;
                cout << "average gradient = " << long_time_ma << endl;
                //cout << "short time average gradient = " << short_time_ma << endl;
                Vec long_time_md = sqrt(long_time_mv);
                cout << "sdev(gradient) = " << long_time_md << endl;
                cout << "mean(sdev(gradient)) = " << mean(long_time_md) << endl;
                add(long_time_mv, regularizer, eps_scale);
                //divide(1.0,long_time_mv,eps_scale);
                //divide(first_long_time_mv,long_time_mv,eps_scale);
                cout << "eps_scale = " << eps_scale << endl;
                real new_eps = 0.5*(max(eps_scale)+mean(eps_scale));
                start_learning_rate *= prev_eps / new_eps;
                learning_rate = start_learning_rate / (1 + decrease_constant*t);
                cout << "scale learning rate by " << prev_eps / new_eps << " to " << learning_rate << endl;

                //real *e=eps_scale.data();
                //for (int i=0;i<eps_scale.length();i++)
                //    if (e[i]>regularizer) e[i]=regularizer;
                //cout << "regularized eps_scale = " << eps_scale << endl;
                //cout << "avg/sdev) = " << long_time_md << endl;
                //eps_scale *= learning_rate;
                //cout << "regularized eps_scale * learning_rate = " << eps_scale << endl;
            }
        }
        // set params += -learning_rate * eps_scale * params.gradient
        meancost += cost->value;
        gradient *= eps_scale;
        params.update(-learning_rate, gradient);
    }
    return meancost[0];
}


} // end of namespace PLearn
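
The learning-rate policy used by optimizeN() above can be summarized in a few lines. The following standalone sketch is illustrative only and not part of PLearn: the helper next_learning_rate, the ScheduleRow struct, and the std::vector-based schedule are assumed names introduced here to mirror the inverse-decay branch and the threshold/factor lookup of the lr_schedule branch.

    #include <cstdio>
    #include <initializer_list>
    #include <vector>

    // One row of the assumed schedule: (iteration_threshold, learning_rate_factor).
    struct ScheduleRow { long iteration_threshold; double learning_rate_factor; };

    // Sketch of the rate computed for a given stage.
    double next_learning_rate(long stage, double start_learning_rate,
                              double decrease_constant,
                              const std::vector<ScheduleRow>& lr_schedule)
    {
        if (lr_schedule.empty())
            // no schedule: inverse decay, as in the decrease_constant branch
            return start_learning_rate / (1.0 + decrease_constant * stage);

        // schedule lookup: advance while the current row's threshold is exceeded
        std::size_t row = 0;
        while (row + 1 < lr_schedule.size() && stage > lr_schedule[row].iteration_threshold)
            ++row;
        return start_learning_rate * lr_schedule[row].learning_rate_factor;
    }

    int main()
    {
        std::vector<ScheduleRow> schedule = { {10000, 1.0}, {50000, 0.1}, {100000, 0.01} };
        for (long stage : {0L, 10000L, 10001L, 50001L})
            std::printf("stage %ld -> lr %g\n",
                        stage, next_learning_rate(stage, 0.01, 0.0, schedule));
        return 0;
    }

In the actual optimizeN() the row index (current_schedule) persists across iterations, which is equivalent to this stateless lookup as long as stage only increases.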
