
AdaptGradientOptimizer.cc

// -*- C++ -*-

// PLearn (A C++ Machine Learning Library)
// Copyright (C) 1998 Pascal Vincent
// Copyright (C) 1999-2003 Pascal Vincent, Yoshua Bengio,
// Olivier Delalleau and University of Montreal
//

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//  1. Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//
//  2. Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//
//  3. The name of the authors may not be used to endorse or promote
//     products derived from this software without specific prior written
//     permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
// NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This file is part of the PLearn library. For more information on the PLearn
// library, go to the PLearn Web site at www.plearn.org


/* *******************************************************
 * $Id: AdaptGradientOptimizer.cc,v 1.18 2004/07/21 16:30:54 chrish42 Exp $
 * This file is part of the PLearn library.
 ******************************************************* */

#include "AdaptGradientOptimizer.h"
#include <plearn/var/SumOfVariable.h>

namespace PLearn {
using namespace std;

AdaptGradientOptimizer::AdaptGradientOptimizer(real the_start_learning_rate,
                                               real the_decrease_constant,
                                               real the_min_learning_rate,
                                               real the_max_learning_rate,
                                               int the_learning_rate_adaptation,
                                               real the_adapt_coeff1,
                                               real the_adapt_coeff2,
                                               int n_updates, const string& filename,
                                               int every_iterations)
    : inherited(n_updates, filename, every_iterations),
      start_learning_rate(the_start_learning_rate),
      min_learning_rate(the_min_learning_rate),
      max_learning_rate(the_max_learning_rate),
      learning_rate_adaptation(the_learning_rate_adaptation),
      adapt_coeff1(the_adapt_coeff1),
      adapt_coeff2(the_adapt_coeff2),
      decrease_constant(the_decrease_constant),
      adapt_every(0)
{}

AdaptGradientOptimizer::AdaptGradientOptimizer(VarArray the_params, Var the_cost,
                                               real the_start_learning_rate,
                                               real the_decrease_constant,
                                               real the_min_learning_rate,
                                               real the_max_learning_rate,
                                               int the_learning_rate_adaptation,
                                               real the_adapt_coeff1,
                                               real the_adapt_coeff2,
                                               int n_updates, const string& filename,
                                               int every_iterations)
    : inherited(the_params, the_cost, n_updates, filename, every_iterations),
      start_learning_rate(the_start_learning_rate),
      min_learning_rate(the_min_learning_rate),
      max_learning_rate(the_max_learning_rate),
      learning_rate_adaptation(the_learning_rate_adaptation),
      adapt_coeff1(the_adapt_coeff1),
      adapt_coeff2(the_adapt_coeff2),
      decrease_constant(the_decrease_constant) {}

AdaptGradientOptimizer::AdaptGradientOptimizer(VarArray the_params, Var the_cost,
                                               VarArray update_for_measure,
                                               real the_start_learning_rate,
                                               real the_decrease_constant,
                                               real the_min_learning_rate,
                                               real the_max_learning_rate,
                                               int the_learning_rate_adaptation,
                                               real the_adapt_coeff1,
                                               real the_adapt_coeff2,
                                               int n_updates, const string& filename,
                                               int every_iterations)
    : inherited(the_params, the_cost, update_for_measure,
                n_updates, filename, every_iterations),
      start_learning_rate(the_start_learning_rate),
      min_learning_rate(the_min_learning_rate),
      max_learning_rate(the_max_learning_rate),
      learning_rate_adaptation(the_learning_rate_adaptation),
      adapt_coeff1(the_adapt_coeff1),
      adapt_coeff2(the_adapt_coeff2),
      decrease_constant(the_decrease_constant) {}


void AdaptGradientOptimizer::declareOptions(OptionList& ol)
{
    declareOption(ol, "start_learning_rate", &AdaptGradientOptimizer::start_learning_rate, OptionBase::buildoption,
                  "    the initial learning rate\n");

    declareOption(ol, "min_learning_rate", &AdaptGradientOptimizer::min_learning_rate, OptionBase::buildoption,
                  "    the minimum value for the learning rate, when there is learning rate adaptation\n");

    declareOption(ol, "max_learning_rate", &AdaptGradientOptimizer::max_learning_rate, OptionBase::buildoption,
                  "    the maximum value for the learning rate, when there is learning rate adaptation\n");

    declareOption(ol, "adapt_coeff1", &AdaptGradientOptimizer::adapt_coeff1, OptionBase::buildoption,
                  "    a coefficient for learning rate adaptation, use may depend on the kind of adaptation\n");
adaptation\n"); 00125 00126 declareOption(ol, "adapt_coeff2", &AdaptGradientOptimizer::adapt_coeff2, OptionBase::buildoption, 00127 " a coefficient for learning rate adaptation, use may depend on the kind of adaptation\n"); 00128 00129 declareOption(ol, "decrease_constant", &AdaptGradientOptimizer::decrease_constant, OptionBase::buildoption, 00130 " the learning rate decrease constant : each update of the weights is scaled by the\n\ 00131 coefficient 1/(1 + stage * decrease_constant)\n"); 00132 00133 declareOption(ol, "learning_rate_adaptation", &AdaptGradientOptimizer::learning_rate_adaptation, OptionBase::buildoption, 00134 " the way the learning rates evolve :\n\ 00135 - 0 : no adaptation\n\ 00136 - 1 : basic adaptation :\n\ 00137 if the gradient of the weight i has the same sign for two consecutive epochs\n\ 00138 then lr(i) = lr(i) + lr(i) * adapt_coeff1\n\ 00139 else lr(i) = lr(i) - lr(i) * adapt_coeff2\n\ 00140 - 2 : ALAP1 formula. See code (not really tested)\n\ 00141 - 3 : variance-dependent learning rate :\n\ 00142 let avg(i) be the exponential average of the variance of the gradient of the weight i\n\ 00143 over the past epochs, where the coefficient for the exponential average is adapt_coeff1\n\ 00144 (adapt_coeff1 = 0 means no average)\n\ 00145 if avg(i) is low (ie < average of all avg(j))\n\ 00146 then lr(i) = max_learning_rate\n\ 00147 else lr(i) = min_learning_rate\n"); 00148 00149 declareOption(ol, "adapt_every", &AdaptGradientOptimizer::adapt_every, OptionBase::buildoption, 00150 " the learning rate adaptation will occur after adapt_every updates of the weights (0 means after each epoch)\n"); 00151 00152 inherited::declareOptions(ol); 00153 } 00154 00155 PLEARN_IMPLEMENT_OBJECT(AdaptGradientOptimizer, 00156 "An optimizer that performs gradient descent with learning rate adaptation.", 00157 "" 00158 ); 00159 00161 // build_ // 00163 void AdaptGradientOptimizer::build_(){ 00164 early_stop = false; 00165 count_updates = 0; 00166 learning_rate = start_learning_rate; 00167 SumOfVariable* sumofvar = dynamic_cast<SumOfVariable*>((Variable*)cost); 00168 stochastic_hack = sumofvar!=0 && sumofvar->nsamples==1; 00169 params.clearGradient(); 00170 int n = params.nelems(); 00171 if (n > 0) { 00172 store_var_grad.resize(n); 00173 store_var_grad.clear(); 00174 store_grad.resize(n); 00175 store_quad_grad.resize(n); 00176 store_grad.clear(); 00177 store_quad_grad.clear(); 00178 learning_rates.resize(n); 00179 gradient.resize(n); 00180 tmp_storage.resize(n); 00181 old_evol.resize(n); 00182 oldgradientlocations.resize(params.size()); 00183 learning_rates.fill(start_learning_rate); 00184 switch (learning_rate_adaptation) { 00185 case 0: 00186 break; 00187 case 1: 00188 // tmp_storage is used to store the old parameters 00189 params.copyTo(tmp_storage); 00190 old_evol.fill(0); 00191 break; 00192 case 2: 00193 // tmp_storage is used to store the initial opposite gradient 00194 Optimizer::computeOppositeGradient(this, tmp_storage); 00195 break; 00196 case 3: 00197 break; 00198 default: 00199 break; 00200 } 00201 } 00202 } 00203 00205 // adaptLearningRateALAP1 // 00207 void AdaptGradientOptimizer::adaptLearningRateALAP1( 00208 Vec old_gradient, 00209 Vec new_gradient) { 00210 int j = 0; // the current index in learning_rates 00211 real prod = 0; 00212 for (j = 0; j<params.nelems(); j++) { 00213 prod += old_gradient[j] * new_gradient[j]; 00214 } 00215 // The division by j=params.nelems() is a scaling coeff 00216 learning_rate = learning_rate + adapt_coeff1 * prod / real(j); 00217 if 
    if (learning_rate < min_learning_rate) {
        learning_rate = min_learning_rate;
    } else if (learning_rate > max_learning_rate) {
        learning_rate = max_learning_rate;
    }
}

////////////////////////////
// adaptLearningRateBasic //
////////////////////////////
void AdaptGradientOptimizer::adaptLearningRateBasic(
    Vec old_params,
    Vec old_evol) {
    Var* array = params->data();
    int j = 0;
    int k;
    real u; // used to store old_evol[j]
    for (int i = 0; i < params.size(); i++) {
        k = j;
        for (; j < k + array[i]->nelems(); j++) {
            u = old_evol[j];
            real diff = array[i]->valuedata[j-k] - old_params[j];
            if (diff > 0) {
                // the parameter has increased
                if (u > 0) {
                    old_evol[j]++;
                } else {
                    old_evol[j] = +1;
                }
            } else if (diff < 0) {
                // the parameter has decreased
                if (u < 0) {
                    old_evol[j]--;
                } else {
                    old_evol[j] = -1;
                }
            } else {
                // there has been no change
                old_evol[j] = 0;
            }
            if (u * old_evol[j] > 0) {
                // consecutive updates in the same direction
                learning_rates[j] += learning_rates[j] * adapt_coeff1;
            }
            else if (u * old_evol[j] < 0) {
                // oscillation
                learning_rates[j] -= learning_rates[j] * adapt_coeff2;
            }

            if (learning_rates[j] < min_learning_rate) {
                learning_rates[j] = min_learning_rate;
            } else if (learning_rates[j] > max_learning_rate) {
                learning_rates[j] = max_learning_rate;
            }
        }
    }
}

///////////////////////////////
// adaptLearningRateVariance //
///////////////////////////////
void AdaptGradientOptimizer::adaptLearningRateVariance() {
    real moy_var = 0;
    real exp_avg_coeff = 0;
    if (stage > 1) {
        exp_avg_coeff = adapt_coeff1;
    }
    for (int j = 0; j < params.nelems(); j++) {
        // Compute variance
        store_var_grad[j] =
            store_var_grad[j] * exp_avg_coeff +
            (store_quad_grad[j] - store_grad[j]*store_grad[j] / real(count_updates))
            * (1 - exp_avg_coeff);
        moy_var += store_var_grad[j];
    }
    count_updates = 0;
    store_quad_grad.clear();
    store_grad.clear();
    moy_var /= real(params.nelems());
    int nb_low_var = 0, nb_high_var = 0;
    real var_limit = 1.0;
    for (int j = 0; j < params.nelems(); j++) {
        if (store_var_grad[j] <= moy_var * var_limit) {
            learning_rates[j] = max_learning_rate;
            nb_low_var++;
        } else {
            learning_rates[j] = min_learning_rate;
            nb_high_var++;
        }
    }
}

//////////////
// optimize //
//////////////
real AdaptGradientOptimizer::optimize()
{
    PLERROR("In AdaptGradientOptimizer::optimize Deprecated, use optimizeN !");
    return 0;
}

///////////////
// optimizeN //
///////////////
bool AdaptGradientOptimizer::optimizeN(VecStatsCollector& stats_coll) {

    bool adapt = (learning_rate_adaptation != 0);
    stochastic_hack = stochastic_hack && !adapt;
    if (adapt_every == 0) {
        adapt_every = nstages;  // the number of steps to complete an epoch
    }

    // Big hack for the special case of stochastic gradient, to avoid doing an explicit update
    // (temporarily change the gradient fields of the parameters to point to the parameters themselves,
    // so that gradients are "accumulated" directly in the parameters, thus updating them!)
    if (stochastic_hack) {
        int n = params.size();
        for (int i = 0; i < n; i++)
            oldgradientlocations[i] = params[i]->defineGradientLocation(params[i]->matValue);
    }

    int stage_max = stage + nstages; // the stage to reach

    for (; !early_stop && stage < stage_max; stage++) {

        // Take into account the learning rate decrease
        // This is actually done during the update step, except when there is no
        // learning rate adaptation
        switch (learning_rate_adaptation) {
        case 0:
            learning_rate = start_learning_rate / (1.0 + decrease_constant * stage);
            break;
        default:
            break;
        }

        proppath.clearGradient();
        if (adapt)
            cost->gradient[0] = -1.;
        else
            cost->gradient[0] = -learning_rate;

        proppath.fbprop();

        // Actions to take after each step, depending on the
        // adaptation method used:
        // - moving along the chosen direction
        // - adapting the learning rate
        // - storing some data
        real coeff = 1 / (1.0 + stage * decrease_constant); // the scaling coefficient
        switch (learning_rate_adaptation) {
        case 0:
            if (!stochastic_hack) {
                params.updateAndClear();
            }
            break;
        case 1:
            params.copyGradientTo(gradient);
            // TODO Not really efficient, write a faster update ?
            params.update(learning_rates, gradient, coeff);
            params.clearGradient();
            break;
        case 2:
            params.copyGradientTo(gradient);
            adaptLearningRateALAP1(tmp_storage, gradient);
            params.update(learning_rate, gradient);
            tmp_storage << gradient;
            params.clearGradient();
            break;
        case 3:
            // storing sum and sum-of-squares of the gradient in order to compute
            // the variance later
            params.copyGradientTo(gradient);
            for (int i = 0; i < params.nelems(); i++) {
                store_grad[i] += gradient[i];
                store_quad_grad[i] += gradient[i] * gradient[i];
            }
            count_updates++;
            params.update(learning_rates, gradient, coeff);
            params.clearGradient();
            break;
        default:
            break;
        }

        if ((stage + 1) % adapt_every == 0) {
            // Time for learning rate adaptation
            switch (learning_rate_adaptation) {
            case 0:
                break;
            case 1:
                adaptLearningRateBasic(tmp_storage, old_evol);
                params.copyTo(tmp_storage);
                break;
            case 2:
                // Nothing, the adaptation is done after each example
                break;
            case 3:
                adaptLearningRateVariance();
                break;
            default:
                break;
            }
        }

        stats_coll.update(cost->value);
    }

    if (stochastic_hack) // restore the gradients as they previously were...
    {
        int n = params.size();
        for (int i = 0; i < n; i++)
            params[i]->defineGradientLocation(oldgradientlocations[i]);
    }

    if (early_stop)
        cout << "Early Stopping !" << endl;
    return early_stop;
}

} // end of namespace PLearn
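
The sketch below shows one plausible way to drive this optimizer; it is illustrative only and not part of AdaptGradientOptimizer.cc. The helper name trainOneEpoch and all numeric values are invented for the example, and it assumes, as the code above suggests but the header would confirm, that build() should be called after direct construction (so that build_() runs) and that nstages is the inherited Optimizer member controlling how many parameter updates each optimizeN() call performs.

// Hypothetical usage sketch -- not part of the PLearn sources.
#include "AdaptGradientOptimizer.h"

using namespace PLearn;

// Runs one optimization pass; `params` and `cost` are assumed to come from an
// already-built network, and `stats` collects the cost values seen during training.
bool trainOneEpoch(VarArray params, Var cost, int n_train_samples,
                   VecStatsCollector& stats)
{
    AdaptGradientOptimizer opt(params, cost,
                               0.01,   // start_learning_rate
                               0.0,    // decrease_constant
                               1e-4,   // min_learning_rate
                               1e-1,   // max_learning_rate
                               1,      // learning_rate_adaptation: sign-based scheme (mode 1)
                               0.05,   // adapt_coeff1: relative increase on a consistent sign
                               0.05,   // adapt_coeff2: relative decrease on oscillation
                               1,      // n_updates
                               "",     // filename (no log file)
                               1);     // every_iterations
    opt.build();                   // assumed: usual PLearn build step, so build_() runs
    opt.nstages = n_train_samples; // assumed inherited option: updates per optimizeN() call
    return opt.optimizeN(stats);   // true means early stopping was requested
}

With mode 1, the learning rate of each parameter grows by adapt_coeff1 while its updates keep the same sign across adaptation periods and shrinks by adapt_coeff2 when they oscillate, clipped to [min_learning_rate, max_learning_rate], exactly as adaptLearningRateBasic() implements above.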
