
ConjGradientOptimizer.cc

// -*- C++ -*-

// PLearn (A C++ Machine Learning Library)
// Copyright (C) 2003 Olivier Delalleau
//

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//  1. Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//
//  2. Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//
//  3. The name of the authors may not be used to endorse or promote
//     products derived from this software without specific prior written
//     permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
// NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This file is part of the PLearn library. For more information on the PLearn
// library, go to the PLearn Web site at www.plearn.org


/* *******************************************************
 * $Id: ConjGradientOptimizer.cc,v 1.55 2004/07/21 16:30:54 chrish42 Exp $
 * This file is part of the PLearn library.
 ******************************************************* */

#include "ConjGradientOptimizer.h"
#include <plearn/math/TMat_maths_impl.h>

namespace PLearn {
using namespace std;

//
// Constructors
//
ConjGradientOptimizer::ConjGradientOptimizer(
    real the_starting_step_size,
    real the_restart_coeff,
    real the_epsilon,
    real the_sigma,
    real the_rho,
    real the_fmax,
    real the_stop_epsilon,
    real the_tau1,
    real the_tau2,
    real the_tau3,
    int n_updates, const string& filename,
    int every_iterations)
    : inherited(n_updates, filename, every_iterations),
      compute_cost(1),
      line_search_algo(1),
      find_new_direction_formula(1),
      starting_step_size(the_starting_step_size), restart_coeff(the_restart_coeff),
      epsilon(the_epsilon),
      sigma(the_sigma), rho(the_rho), fmax(the_fmax),
      stop_epsilon(the_stop_epsilon), tau1(the_tau1), tau2(the_tau2),
      tau3(the_tau3), max_steps(5), initial_step(0.01), low_enough(1e-6) {}

ConjGradientOptimizer::ConjGradientOptimizer(
    VarArray the_params,
    Var the_cost,
    real the_starting_step_size,
    real the_restart_coeff,
    real the_epsilon,
    real the_sigma,
    real the_rho,
    real the_fmax,
    real the_stop_epsilon,
    real the_tau1,
    real the_tau2,
    real the_tau3,
    int n_updates, const string& filename,
    int every_iterations)
    : inherited(the_params, the_cost, n_updates, filename, every_iterations),
      starting_step_size(the_starting_step_size), restart_coeff(the_restart_coeff),
      epsilon(the_epsilon),
      sigma(the_sigma), rho(the_rho), fmax(the_fmax),
      stop_epsilon(the_stop_epsilon), tau1(the_tau1), tau2(the_tau2),
      tau3(the_tau3) {
    cout << "Warning: you should use the constructor ConjGradientOptimizer(), or some default options may not be set properly" << endl;
}

ConjGradientOptimizer::ConjGradientOptimizer(
    VarArray the_params,
    Var the_cost,
    VarArray the_update_for_measure,
    real the_starting_step_size,
    real the_restart_coeff,
    real the_epsilon,
    real the_sigma,
    real the_rho,
    real the_fmax,
    real the_stop_epsilon,
    real the_tau1,
    real the_tau2,
    real the_tau3,
    int n_updates, const string& filename,
    int every_iterations)
    : inherited(the_params, the_cost, the_update_for_measure,
                n_updates, filename, every_iterations),
      starting_step_size(the_starting_step_size), restart_coeff(the_restart_coeff),
      epsilon(the_epsilon),
      sigma(the_sigma), rho(the_rho), fmax(the_fmax),
      stop_epsilon(the_stop_epsilon), tau1(the_tau1), tau2(the_tau2),
      tau3(the_tau3) {
    cout << "Warning: you should use the constructor ConjGradientOptimizer(), or some default options may not be set properly" << endl;
}

PLEARN_IMPLEMENT_OBJECT(ConjGradientOptimizer,
    "Optimizer based on the conjugate gradient method.",
    "The conjugate gradient algorithm is basically the following:\n"
    "- 0: initialize the search direction d = -gradient\n"
    "- 1: perform a line search along direction d for a minimum of the cost\n"
    "- 2: move to this minimum, update the search direction d and go to step 1\n"
    "There are various methods available through the options for both steps 1 and 2."
);

//
// declareOptions
//
void ConjGradientOptimizer::declareOptions(OptionList& ol)
{
    declareOption(ol, "starting_step_size", &ConjGradientOptimizer::starting_step_size, OptionBase::buildoption,
        "    The initial step size for the line search algorithm.\n \
    This option has very little influence on the algorithm's performance,\n \
    since it is only used for the first iteration, and is set by the algorithm\n \
    in the later stages.\n");

    declareOption(ol, "restart_coeff", &ConjGradientOptimizer::restart_coeff, OptionBase::buildoption,
        "    The restart coefficient. A restart is triggered when the following condition is met:\n \
    abs(gradient(k-1).gradient(k)) >= restart_coeff * gradient(k)^2\n \
    If no restart is wanted, any high value (e.g. 100) will prevent it.\n");

    declareOption(ol, "line_search_algo", &ConjGradientOptimizer::line_search_algo, OptionBase::buildoption,
        "    The kind of line search algorithm used:\n \
    1. The line search algorithm described in\n \
       \"Practical Methods of Optimization, 2nd Ed\"\n \
       by Fletcher (1987).\n \
       This algorithm seeks a minimum m of f(x) = C(x0 + x.d) satisfying the two constraints:\n \
       i.  abs(f'(m)) < -sigma.f'(0)\n \
       ii. f(m) < f(0) + m.rho.f'(0)\n \
       with rho < sigma <= 1.\n \
    2. The GSearch algorithm, described in\n \
       \"Direct Gradient-Based Reinforcement Learning: II. Gradient Ascent Algorithms and Experiments\"\n \
       by J. Baxter, L. Weaver and P. Bartlett (1999).\n \
    3. A Newton line search algorithm, for quadratic costs only.\n");

    declareOption(ol, "find_new_direction_formula", &ConjGradientOptimizer::find_new_direction_formula, OptionBase::buildoption,
        "    The kind of formula used in step 2 of the conjugate gradient algorithm to find the\n \
    new search direction:\n \
    1. ConjPOMDP: the formula associated with GSearch in the same paper. It is\n \
       almost the same as the Polak-Ribiere formula.\n \
    2. Dai-Yuan\n \
    3. Fletcher-Reeves\n \
    4. Hestenes-Stiefel\n \
    5. Polak-Ribiere: this is probably the most commonly used.\n \
    The value of this option (from 1 to 5) indicates the formula used.\n");

    declareOption(ol, "epsilon", &ConjGradientOptimizer::epsilon, OptionBase::buildoption,
        "    GSearch-specific option: the gradient resolution.\n \
    This small value is used in the GSearch algorithm instead of 0, to provide\n \
    some robustness against errors in the estimate of the gradient, and to\n \
    prevent the algorithm from looping indefinitely if there is no local minimum.\n \
    Any small value should work, and the overall performance should not depend\n \
    much on it.\n");
    // TODO Check that the last sentence is true!
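
    // Note: conditions i and ii in the "line_search_algo" help above correspond to
    // the standard strong Wolfe line search conditions, namely the curvature
    // condition |f'(m)| <= -sigma * f'(0) (controlled by the "sigma" option
    // declared below) and the sufficient-decrease (Armijo) condition
    // f(m) <= f(0) + m * rho * f'(0) (controlled by "rho"), with 0 < rho < sigma.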

    declareOption(ol, "sigma", &ConjGradientOptimizer::sigma, OptionBase::buildoption,
        "    Fletcher's line search specific option: constraint parameter in condition i.\n");

    declareOption(ol, "rho", &ConjGradientOptimizer::rho, OptionBase::buildoption,
        "    Fletcher's line search specific option: constraint parameter in condition ii.\n");

    declareOption(ol, "fmax", &ConjGradientOptimizer::fmax, OptionBase::buildoption,
        "    Fletcher's line search specific option: a good enough minimum.\n \
    If the line search finds a point n such that f(n) < fmax, it returns n as the minimum.\n \
    As a consequence, DO NOT USE 0 if f can be negative.\n");

    declareOption(ol, "stop_epsilon", &ConjGradientOptimizer::stop_epsilon, OptionBase::buildoption,
        "    Fletcher's line search specific option: stopping criterion.\n \
    This option allows the algorithm to detect when no improvement is possible along\n \
    the search direction. It should be set to a small value, or we may stop too early.\n \
    IMPORTANT: this same option is also used to compute the next step size. If you get\n \
    a nan in the cost, try a higher stop_epsilon.\n");

    declareOption(ol, "tau1", &ConjGradientOptimizer::tau1, OptionBase::buildoption,
        "    Fletcher's line search specific option: bracketing parameter.\n \
    This option controls how fast the bracketing interval grows in the first\n \
    phase of the line search algorithm.\n \
    Fletcher reports good empirical results with tau1 = 9.\n");

    declareOption(ol, "tau2", &ConjGradientOptimizer::tau2, OptionBase::buildoption,
        "    Fletcher's line search specific option: bracketing parameter.\n \
    This option controls how fast the left end of the bracketing interval moves\n \
    in the second phase of the line search algorithm.\n \
    Fletcher reports good empirical results with tau2 = 0.1.\n");

    declareOption(ol, "tau3", &ConjGradientOptimizer::tau3, OptionBase::buildoption,
        "    Fletcher's line search specific option: bracketing parameter.\n \
    This option controls how fast the right end of the bracketing interval shrinks\n \
    in the second phase of the line search algorithm.\n \
    Fletcher reports good empirical results with tau3 = 0.5.\n");

    declareOption(ol, "max_steps", &ConjGradientOptimizer::max_steps, OptionBase::buildoption,
        "    Newton line search specific option: maximum number of steps at each iteration.\n \
    This option defines how many steps will be performed to find the minimum, if no\n \
    value < low_enough is found for the gradient (this can happen if the cost isn't\n \
    perfectly quadratic).\n");

    declareOption(ol, "initial_step", &ConjGradientOptimizer::initial_step, OptionBase::buildoption,
        "    Newton line search specific option: value of the first step.\n \
    This option controls the size of the first step made in the search direction.\n");

    declareOption(ol, "low_enough", &ConjGradientOptimizer::low_enough, OptionBase::buildoption,
        "    Newton line search specific option: stopping criterion for the gradient.\n \
    We say the minimum has been found when abs(gradient) < low_enough.\n");

    declareOption(ol, "compute_cost", &ConjGradientOptimizer::compute_cost, OptionBase::buildoption,
        "    If set to 1, the mean cost will be computed and displayed at each epoch.\n");

    inherited::declareOptions(ol);
}

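// For reference, writing g_k for the gradient at the current iterate,
// y_k = g_k - g_(k-1), and d_(k-1) for the previous search direction, the
// formulas selectable through "find_new_direction_formula" compute gamma as:
//   Fletcher-Reeves  : gamma = ||g_k||^2 / ||g_(k-1)||^2
//   Polak-Ribiere    : gamma = (g_k . y_k) / ||g_(k-1)||^2
//   Hestenes-Stiefel : gamma = (g_k . y_k) / (d_(k-1) . y_k)
//   Dai-Yuan         : gamma = ||g_k||^2 / (d_(k-1) . y_k)
//   ConjPOMDP        : the Polak-Ribiere value, reset to 0 whenever the
//                      resulting direction would not be a descent direction.
// The new search direction is then d_k = -g_k + gamma * d_(k-1)
// (see updateSearchDirection below).
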
/******************************
 * MAIN METHODS AND FUNCTIONS *
 ******************************/

////////////
// build_ //
////////////
void ConjGradientOptimizer::build_() {
    // Make sure the internal data have the right size.
    int n = params.nelems();
    if (n > 0) {
        current_opp_gradient.resize(n);
        search_direction.resize(n);
        tmp_storage.resize(n);
        delta.resize(n);
        if (cost.length() > 0) {
            meancost.resize(cost->size());
        }
    }
}

//////////////////////////////
// computeCostAndDerivative //
//////////////////////////////
void ConjGradientOptimizer::computeCostAndDerivative(
    real alpha, ConjGradientOptimizer* opt, real& cost, real& derivative) {
    if (alpha == 0) {
        cost = opt->last_cost;
        derivative = -dot(opt->search_direction, opt->current_opp_gradient);
    } else {
        // TODO See why different from computeDerivative.
        opt->params.copyTo(opt->tmp_storage);
        opt->params.update(alpha, opt->search_direction);
        opt->proppath.clearGradient();
        opt->params.clearGradient();
        opt->cost->gradient[0] = 1;
        opt->proppath.fbprop();
        opt->params.copyGradientTo(opt->delta);
        cost = opt->cost->value[0];
        derivative = dot(opt->search_direction, opt->delta);
        opt->params.copyFrom(opt->tmp_storage);
    }
}

//////////////////////
// computeCostValue //
//////////////////////
real ConjGradientOptimizer::computeCostValue(
    real alpha,
    ConjGradientOptimizer* opt) {
    if (alpha == 0) {
        return opt->last_cost;
    }
    opt->params.copyTo(opt->tmp_storage);
    opt->params.update(alpha, opt->search_direction);
    opt->proppath.fprop();
    real c = opt->cost->value[0];
    opt->params.copyFrom(opt->tmp_storage);
    return c;
}

///////////////////////
// computeDerivative //
///////////////////////
real ConjGradientOptimizer::computeDerivative(
    real alpha,
    ConjGradientOptimizer* opt) {
    if (alpha == 0)
        return -dot(opt->search_direction, opt->current_opp_gradient);
    opt->params.copyTo(opt->tmp_storage);
    opt->params.update(alpha, opt->search_direction);
    Optimizer::computeGradient(opt, opt->delta);
    opt->params.copyFrom(opt->tmp_storage);
    return dot(opt->search_direction, opt->delta);
}

///////////////
// conjpomdp //
///////////////
real ConjGradientOptimizer::conjpomdp (
    void (*grad)(Optimizer*, const Vec& gradient),
    ConjGradientOptimizer* opt) {
    int i;
    // delta = gradient
    real norm_g = pownorm(opt->current_opp_gradient);
    // tmp_storage <- delta - g (g = current_opp_gradient)
    for (i=0; i<opt->current_opp_gradient.length(); i++) {
        opt->tmp_storage[i] = opt->delta[i] - opt->current_opp_gradient[i];
    }
    real gamma = dot(opt->tmp_storage, opt->delta) / norm_g;
    // h <- delta + gamma * h (h = search_direction)
    for (i=0; i<opt->search_direction.length(); i++) {
        opt->tmp_storage[i] = opt->delta[i] + gamma * opt->search_direction[i];
    }
    if (dot(opt->tmp_storage, opt->delta) < 0)
        return 0;
    else
        return gamma;
}

///////////////////
// cubicInterpol //
///////////////////
void ConjGradientOptimizer::cubicInterpol(
    real f0, real f1, real g0, real g1,
    real& a, real& b, real& c, real& d) {
    d = f0;
    c = g0;
    b = 3*(f1-f0) - 2*g0 - g1;
    a = g0 + g1 - 2*(f1 - f0);
}

/////////////
// daiYuan //
/////////////
real ConjGradientOptimizer::daiYuan (
    void (*grad)(Optimizer*, const Vec&),
    ConjGradientOptimizer* opt) {
    real norm_grad = pownorm(opt->delta);
    for (int i=0; i<opt->current_opp_gradient.length(); i++) {
        opt->tmp_storage[i] = -opt->delta[i] + opt->current_opp_gradient[i];
    }
    real gamma = norm_grad / dot(opt->search_direction, opt->tmp_storage);
    return gamma;
}

///////////////////
// findDirection //
///////////////////
bool ConjGradientOptimizer::findDirection() {
    bool isFinished = false;
    real gamma;
    switch (find_new_direction_formula) {
    case 1:
        gamma = conjpomdp(computeOppositeGradient, this);
        break;
    case 2:
        gamma = daiYuan(computeOppositeGradient, this);
        break;
    case 3:
        gamma = fletcherReeves(computeOppositeGradient, this);
        break;
    case 4:
        gamma = hestenesStiefel(computeOppositeGradient, this);
        break;
    case 5:
        gamma = polakRibiere(computeOppositeGradient, this);
        break;
    default:
        cout << "Warning ! Invalid conjugate gradient formula !" << endl;
        gamma = 0;
        break;
    }
    // It is suggested to keep gamma >= 0
    if (gamma < 0) {
        cout << "gamma < 0 ! gamma = " << gamma << " ==> Restarting" << endl;
        gamma = 0;
    }
    else {
        real dp = dot(delta, current_opp_gradient);
        real delta_n = pownorm(delta);
        if (abs(dp) > restart_coeff * delta_n) {
            // cout << "Restart triggered !" << endl;
            gamma = 0;
        }
    }
    updateSearchDirection(gamma);
    // If the gradient is very small, we can stop !
    // isFinished = pownorm(current_opp_gradient) < 0.0000001;
    // TODO This may lead to an erroneous early stop. To investigate ?
    isFinished = false;
    if (isFinished)
        cout << "Gradient is small enough, time to stop" << endl;
    return isFinished;
}

//////////////////////////////
// findMinWithCubicInterpol //
//////////////////////////////
real ConjGradientOptimizer::findMinWithCubicInterpol (
    real p1,
    real p2,
    real mini,
    real maxi,
    real f0,
    real f1,
    real g0,
    real g1) {
    // We prefer p2 > p1 and maxi > mini
    real tmp;
    if (p2 < p1) {
        tmp = p2;
        p2 = p1;
        p1 = tmp;
    }
    if (maxi < mini) {
        tmp = maxi;
        maxi = mini;
        mini = tmp;
    }
    // cout << "Finding min : p1 = " << p1 << " , p2 = " << p2 << " , mini = " << mini << " , maxi = " << maxi << endl;
    // The derivatives must be multiplied by (p2-p1), because we write:
    //   x = p1 + z*(p2-p1)
    // with z in [0,1] => df/dz = (p2-p1)*df/dx
    g0 = g0 * (p2-p1);
    g1 = g1 * (p2-p1);
    real a, b, c, d;
    // Store the interpolation coefficients in a,b,c,d
    cubicInterpol(f0, f1, g0, g1, a, b, c, d);
    // cout << "Interpol : a=" << a << " , b=" << b << " , c=" << c << " , d=" << d << endl;
    real mini_transformed = mini;
    real maxi_transformed = maxi;
    if (mini != -FLT_MAX)
        mini_transformed = (mini - p1) / (p2 - p1);
    if (maxi != FLT_MAX)
        maxi_transformed = (maxi - p1) / (p2 - p1);
    real xmin = minCubic(a, b, c, mini_transformed, maxi_transformed);
    if (xmin == -FLT_MAX || xmin == FLT_MAX)
        return xmin;
    // cout << "min is : xmin = " << p1 + xmin*(p2-p1) << endl;
    return p1 + xmin*(p2-p1);
}

/////////////////////////////
// findMinWithQuadInterpol //
/////////////////////////////
real ConjGradientOptimizer::findMinWithQuadInterpol(
    int q, real sum_x, real sum_x_2, real sum_x_3, real sum_x_4,
    real sum_c_x_2, real sum_g_x, real sum_c_x, real sum_c, real sum_g) {

    real q2 = q*q;
    real sum_x_2_quad = sum_x_2 * sum_x_2;
    real sum_x_quad = sum_x*sum_x;
    real sum_x_2_sum_x = sum_x_2*sum_x;
    real sum_x_2_sum_x_3 = sum_x_2*sum_x_3;
    real sum_x_3_sum_x = sum_x_3*sum_x;
    // TODO This could certainly be optimized more.
    real denom =
        (-4*q2*sum_x_2 - 3*q*sum_x_2_quad + sum_x_2_quad*sum_x_2 +
         q*sum_x_3*sum_x_3 - q2*sum_x_4 - q*sum_x_2*sum_x_4 + 4*q*sum_x_3_sum_x -
         2*sum_x_2_sum_x_3*sum_x + 4*q*sum_x_quad + sum_x_4*sum_x_quad);

    real a =
        -(q2*sum_c_x_2 + 2*q2*sum_g_x - q*sum_c*sum_x_2 +
          q*sum_c_x_2*sum_x_2 + 2*q*sum_g_x*sum_x_2 - sum_c*sum_x_2_quad -
          q*sum_c_x*sum_x_3 - q*sum_g*sum_x_3 - 2*q*sum_c_x*sum_x -
          2*q*sum_g*sum_x + sum_c_x*sum_x_2_sum_x + sum_g*sum_x_2_sum_x +
          sum_c*sum_x_3_sum_x + 2*sum_c*sum_x_quad - sum_c_x_2*sum_x_quad -
          2*sum_g_x*sum_x_quad) / denom;

    real b =
        -(4*q*sum_c_x*sum_x_2 + 4*q*sum_g*sum_x_2 -
          sum_c_x*sum_x_2_quad - sum_g*sum_x_2_quad - q*sum_c_x_2*sum_x_3 -
          2*q*sum_g_x*sum_x_3 + sum_c*sum_x_2_sum_x_3 + q*sum_c_x*sum_x_4 +
          q*sum_g*sum_x_4 - 2*q*sum_c_x_2*sum_x - 4*q*sum_g_x*sum_x -
          2*sum_c*sum_x_2_sum_x + sum_c_x_2*sum_x_2_sum_x +
          2*sum_g_x*sum_x_2_sum_x - sum_c*sum_x_4*sum_x) / denom;

    real xmin = -b / (2*a);
    return xmin;
}

////////////////////
// fletcherReeves //
////////////////////
real ConjGradientOptimizer::fletcherReeves (
    void (*grad)(Optimizer*, const Vec&),
    ConjGradientOptimizer* opt) {
    // delta = opposite gradient
    real gamma = pownorm(opt->delta) / pownorm(opt->current_opp_gradient);
    return gamma;
}

////////////////////
// fletcherSearch //
////////////////////
real ConjGradientOptimizer::fletcherSearch (real mu) {
    real alpha = fletcherSearchMain (
        computeCostValue,
        computeDerivative,
        this,
        sigma,
        rho,
        fmax,
        stop_epsilon,
        tau1,
        tau2,
        tau3,
        current_step_size,
        mu);
    return alpha;
}

////////////////////////
// fletcherSearchMain //
////////////////////////
real ConjGradientOptimizer::fletcherSearchMain (
    real (*f)(real, ConjGradientOptimizer* opt),
    real (*g)(real, ConjGradientOptimizer* opt),
    ConjGradientOptimizer* opt,
    real sigma,
    real rho,
    real fmax,
    real epsilon,
    real tau1,
    real tau2,
    real tau3,
    real alpha1,
    real mu) {

    // Initialization
    real alpha0 = 0;
    // f0 = f(0), f_0 = f(alpha0), f_1 = f(alpha1)
    // g0 = g(0), g_0 = g(alpha0), g_1 = g(alpha1)
    // (for the bracketing phase)
    real alpha2, f0, f_1=0, f_0, g0, g_1=0, g_0, a1=0, a2, b1=0, b2;
    g0 = (*g)(0, opt);
    f0 = (*f)(0, opt);
    f_0 = f0;
    g_0 = g0;
    if (mu == FLT_MAX)
        mu = (fmax - f0) / (rho * g0);
    if (alpha1 == FLT_MAX)
        alpha1 = mu / 100; // My own heuristic
    if (g0 >= 0) {
        cout << "Warning : df/dx(0) >= 0 !" << endl;
        return 0;
    }
    bool isBracketed = false;
    alpha2 = alpha1 + 1; // just to avoid a pathological initialization

    // Bracketing
    while (!isBracketed) {
        // cout << "Bracketing : alpha1 = " << alpha1 << endl << " alpha0 = " << alpha0 << endl;
        if (alpha1 == mu && alpha1 == alpha2) { // NB: personal hack... hopefully this should not happen
            cout << "Warning : alpha1 == alpha2 == mu during bracketing" << endl;
            return alpha1;
        }
        f_1 = (*f)(alpha1, opt);
        if (f_1 <= fmax) {
            cout << "fmax reached !" << endl;
            opt->early_stop = true; // added by dorionc
            return alpha1;
        }
        if (f_1 > f0 + alpha1 * rho * g0 || f_1 > f_0) {
            // NB: in Fletcher's book, there is a typo in the test above
            a1 = alpha0;
            b1 = alpha1;
            isBracketed = true;
            // cout << "Bracketing done : f_1 = " << f_1 << " , alpha1 = " << alpha1 << " , f0 = " << f0 << " , g0 = " << g0 << endl;
        } else {
            g_1 = (*g)(alpha1, opt);
            if (abs(g_1) < -sigma * g0) {
                // cout << "Low gradient : g=" << abs(g_1) << " < " << (-sigma * g0) << endl;
                return alpha1;
            }
            if (g_1 >= 0) {
                a1 = alpha1;
                b1 = alpha0;
                real tmp = f_0;
                f_0 = f_1;
                f_1 = tmp;
                tmp = g_0;
                g_0 = g_1;
                g_1 = tmp;
                isBracketed = true;
            } else {
                if (mu <= 2*alpha1 - alpha0)
                    alpha2 = mu;
                else
                    alpha2 = findMinWithCubicInterpol(
                        alpha1, alpha0,
                        2*alpha1 - alpha0, min(mu, alpha1 + tau1 * (alpha1 - alpha0)),
                        f_1, f_0, g_1, g_0);
            }
        }
        if (!isBracketed) {
            alpha0 = alpha1;
            alpha1 = alpha2;
            f_0 = f_1;
            g_0 = g_1;
        }
    }

    // Splitting
    // NB: At this point, f_0 = f(a1), f_1 = f(b1) (and the same for g)
    // and we'll keep it this way.
    // We then use f1 = f(alpha1) and g1 = g(alpha1).
    real f1, g1;
    bool repeated = false;
    while (true) {
        // cout << "Splitting : alpha1 = " << alpha1 << endl << " a1 = " << a1 << endl << " b1 = " << b1 << endl;
        // cout << "Interval : [" << a1 + tau2 * (b1-a1) << " , " << b1 - tau3 * (b1-a1) << "]" << endl;
        alpha1 = findMinWithCubicInterpol(
            a1, b1,
            a1 + tau2 * (b1-a1), b1 - tau3 * (b1-a1),
            f_0, f_1, g_0, g_1);
        g1 = (*g)(alpha1, opt);
        f1 = opt->cost->value[0]; // Shortcut.
        bool small = ((a1 - alpha1) * g_0 <= epsilon);
        if (small && (a1 > 0 || repeated)) {
            // cout << "Early stop : a1 = " << a1 << " , alpha1 = " << alpha1 << " , g(a1) = " << g_0 << " , epsilon = " << epsilon << endl;
            return a1;
        }
        if (small) repeated = true;
        //g1 = (*g)(alpha1, opt); // TODO See why this has been commented
        if (f1 > f0 + rho * alpha1 * g0 || f1 >= f_0) {
            a2 = a1;
            b2 = alpha1;
            f_1 = f1; // to keep f_1 = f(b1) in the next iteration
            g_1 = g1; // to keep g_1 = g(b1) in the next iteration
        } else {
            if (abs(g1) <= -sigma * g0) {
                // cout << "Found what we were looking for : g(alpha1)=" << abs(g1) << " < " << -sigma * g0 << " with g0 = " << g0 << endl;
                return alpha1;
            }
            if ((b1 - a1) * g1 >= 0) {
                b2 = a1;
                f_1 = f_0; // to keep f_1 = f(b1) in the next iteration
                g_1 = g_0; // to keep g_1 = g(b1) in the next iteration
            } else
                b2 = b1;
            a2 = alpha1;
            f_0 = f1; // to keep f_0 = f(a1) in the next iteration
            g_0 = g1; // to keep g_0 = g(a1) in the next iteration
        }
        a1 = a2;
        b1 = b2;
    }
}

/////////////
// gSearch //
/////////////
real ConjGradientOptimizer::gSearch (void (*grad)(Optimizer*, const Vec&)) {

    real step = current_step_size;
    real sp, sm, pp, pm;

    // Backup the initial parameter values
    params.copyTo(tmp_storage);

    params.update(step, search_direction);
    (*grad)(this, delta);
    real prod = dot(delta, search_direction);
    if (prod < 0) {
        // Step back to bracket the maximum
        do {
            sp = step;
            pp = prod;
            step /= 2.0;
            params.update(-step, search_direction);
            (*grad)(this, delta);
            prod = dot(delta, search_direction);
        } while (prod < -epsilon);
        sm = step;
        pm = prod;
    }
    else {
        // Step forward to bracket the maximum
        do {
            sm = step;
            pm = prod;
            params.update(step, search_direction);
            (*grad)(this, delta);
            prod = dot(delta, search_direction);
            step *= 2.0;
        } while (prod > epsilon);
        sp = step;
        pp = prod;
    }

    if (pm > 0 && pp < 0)
        step = (sm*pp - sp*pm) / (pp - pm);
    else
        step = (sm+sp) / 2;

    // Restore the original parameter values
    params.copyFrom(tmp_storage);
    return step;
}

/////////////////////
// hestenesStiefel //
/////////////////////
real ConjGradientOptimizer::hestenesStiefel (
    void (*grad)(Optimizer*, const Vec&),
    ConjGradientOptimizer* opt) {
    int i;
    // delta = opposite gradient
    // (*grad)(opt, opt->delta); // TODO See why this line has been removed.
    for (i=0; i<opt->current_opp_gradient.length(); i++) {
        opt->tmp_storage[i] = opt->delta[i] - opt->current_opp_gradient[i];
    }
    real gamma = -dot(opt->delta, opt->tmp_storage) / dot(opt->search_direction, opt->tmp_storage);
    return gamma;
}

////////////////
// lineSearch //
////////////////
bool ConjGradientOptimizer::lineSearch() {
    real step;
    switch (line_search_algo) {
    case 1:
        step = fletcherSearch();
        break;
    case 2:
        step = gSearch(computeOppositeGradient);
        break;
    case 3:
        step = newtonSearch(max_steps, initial_step, low_enough);
        break;
    default:
        cout << "Warning ! Invalid conjugate gradient line search algorithm !" << endl;
        step = 0;
        break;
    }
    if (step < 0)
        cout << "Ouch, negative step !" << endl;
    if (step != 0) params.update(step, search_direction);
    if (step == 0)
        cout << "No more progress made by the line search, stopping" << endl;
    return (step == 0);
}

//////////////
// minCubic //
//////////////
real ConjGradientOptimizer::minCubic(
    real a, real b, real c,
    real mini, real maxi) {
    if (a == 0 || (b != 0 && abs(a/b) < 0.0001)) // heuristic value for a == 0
        return minQuadratic(b, c, mini, maxi);
    // f' = 3a.x^2 + 2b.x + c
    real aa = 3*a;
    real bb = 2*b;
    real d = bb*bb - 4 * aa * c;
    if (d <= 0) { // the function is monotonous
        if (a > 0)
            return mini;
        else
            return maxi;
    } else { // the most usual case
        d = sqrt(d);
        real p2 = (-bb + d) / (2*aa);
        if (a > 0) {
            if (p2 < mini || mini == -FLT_MAX)
                return mini;
            if (p2 > maxi) { // the minimum is beyond the range
                if ((mini-maxi) * ((a * mini + b) * (mini + maxi) + a * maxi * maxi + c) > 0)
                    // is the same as: if (a*mini*mini*mini + b*mini*mini + c*mini >
                    //                     a*maxi*maxi*maxi + b*maxi*maxi + c*maxi)
                    return maxi;
                else
                    return mini;
            }
            if ((maxi-p2) * ((a * maxi + b) * (maxi + p2) + a * p2 * p2 + c) > 0)
                // is the same as: if (a*maxi*maxi*maxi + b*maxi*maxi + c*maxi >
                //                     a*p2*p2*p2 + b*p2*p2 + c*p2)
                return p2;
            else
                return mini;
        } else {
            if (p2 > maxi || maxi == FLT_MAX)
                return maxi;
            if (p2 < mini) { // the minimum is before the range
                if ((mini-maxi) * ((a * mini + b) * (mini + maxi) + a * maxi * maxi + c) > 0)
                    // is the same as: if (a*mini*mini*mini + b*mini*mini + c*mini >
                    //                     a*maxi*maxi*maxi + b*maxi*maxi + c*maxi)
                    return maxi;
                else
                    return mini;
            }
            if ((maxi-p2) * ((a * maxi + b) * (maxi + p2) + a * p2 * p2 + c) > 0)
                // is the same as: if (a*maxi*maxi*maxi + b*maxi*maxi + c*maxi >
                //                     a*p2*p2*p2 + b*p2*p2 + c*p2)
                return p2;
            else
                return maxi;
        }
    }
}

//////////////////
// minQuadratic //
//////////////////
real ConjGradientOptimizer::minQuadratic(
    real a, real b,
    real mini, real maxi) {
    if (a == 0 || (b != 0 && abs(a/b) < 0.0001)) { // heuristic for a == 0
        if (b > 0)
            return mini;
        else
            return maxi;
    }
    if (a < 0) {
        if (mini == -FLT_MAX)
            return -FLT_MAX;
        if (maxi == FLT_MAX)
            return FLT_MAX;
        if (mini*mini + mini * b / a > maxi*maxi + maxi * b / a)
            return mini;
        else
            return maxi;
    }
    // Now, the most usual case
    real the_min = -b / (2*a);
    if (the_min < mini)
        return mini;
    if (the_min > maxi)
        return maxi;
    return the_min;
}

//////////////////
// newtonSearch //
//////////////////
real ConjGradientOptimizer::newtonSearch(
    int max_steps,
    real initial_step,
    real low_enough
    ) {
    Vec c(max_steps); // the cost
    Vec g(max_steps); // the gradient
    Vec x(max_steps); // the step
    computeCostAndDerivative(0, this, c[0], g[0]);
    x[0] = 0;
    real step = initial_step;
    real x_2 = x[0]*x[0];
    real x_3 = x[0] * x_2;
    real x_4 = x_2 * x_2;
    real sum_x = x[0];
    real sum_x_2 = x_2;
    real sum_x_3 = x_3;
    real sum_x_4 = x_4;
    real sum_c_x_2 = c[0] * x_2;
    real sum_g_x = g[0] * x[0];
    real sum_c_x = c[0] * x[0];
    real sum_c = c[0];
    real sum_g = g[0];
    for (int i=1; i<max_steps; i++) {
        computeCostAndDerivative(step, this, c[i], g[i]);
        x[i] = step;
        if (abs(g[i]) < low_enough) {
            // we have reached the minimum
            return step;
        }
        x_2 = x[i]*x[i];
        x_3 = x[i] * x_2;
        x_4 = x_2 * x_2;
        sum_x += x[i];
        sum_x_2 += x_2;
        sum_x_3 += x_3;
        sum_x_4 += x_4;
        sum_c_x_2 += c[i] * x_2;
        sum_g_x += g[i] * x[i];
        sum_c_x += c[i] * x[i];
        sum_c += c[i];
        sum_g += g[i];
        step = findMinWithQuadInterpol(i+1, sum_x, sum_x_2, sum_x_3, sum_x_4, sum_c_x_2, sum_g_x, sum_c_x, sum_c, sum_g);
    }
    cout << "Warning : minimum not reached, is the cost really quadratic ?" << endl;
    return step;
}

//////////////
// optimize //
//////////////
real ConjGradientOptimizer::optimize()
{
    cout << "Warning ! In ConjGradientOptimizer::optimize : this method is deprecated, use optimizeN instead" << endl;
    ofstream out;
    if (!filename.empty()) {
        out.open(filename.c_str());
    }
    Vec lastmeancost(cost->size());
    early_stop = false;

    // Initialization of the structures
    computeOppositeGradient(this, current_opp_gradient);
    search_direction << current_opp_gradient; // first direction = -grad
    last_improvement = 0.1; // why not ?
    real last_cost, current_cost;
    cost->fprop();
    last_cost = cost->value[0];
    real df;

    // Loop through the epochs
    for (int t=0; !early_stop && t<nupdates; t++) {
        // TODO Remove this line later
        //cout << "cost = " << cost->value[0] << " " << cost->value[1] << " " << cost->value[2] << endl;

        // Make a line search along the current search direction
        early_stop = lineSearch();
        if (early_stop)
            cout << "Early stopping triggered by the line search" << endl;
        cost->fprop();
        current_cost = cost->value[0];

        // Find the new search direction
        bool early_stop_dir = findDirection();
        if (early_stop_dir)
            cout << "Early stopping triggered by direction search" << endl;
        early_stop = early_stop || early_stop_dir;

        last_improvement = last_cost - current_cost;
        last_cost = current_cost;

        // This value of current_step_size is suggested by Fletcher
        df = max (last_improvement, 10*stop_epsilon);
        current_step_size = min(1.0, 2.0 * df / dot(search_direction, current_opp_gradient));

        // Display results TODO ugly copy/paste from GradientOptimizer: to be cleaned ?
        if (compute_cost) {
            meancost += cost->value;
        }
        if ((every!=0) && ((t+1)%every==0))
        // normally this is done every epoch
        {
            //cerr << ">>>>>> nupdates= " << nupdates << " every=" << every << " sumofvar->nsamples=" << sumofvar->nsamples << endl;
            if (compute_cost) {
                meancost /= real(every);
            }
            //if (decrease_constant != 0)
            //    cout << "at t=" << t << ", learning rate = " << learning_rate << endl;
            if (compute_cost) {
                printStep(cerr, t+1, meancost[0]);
            }
            if (compute_cost && out) {
                printStep(out, t+1, meancost[0]);
            }
            bool early_stop_mesure = false;
            if (compute_cost) {
                early_stop_mesure = measure(t+1, meancost);
            }
            if (early_stop_mesure)
                cout << "Early stopping triggered by the measurer" << endl;
            early_stop = early_stop || early_stop_mesure;
            // early_stop = measure(t+1,meancost); // TODO find which is the best between this and the one above
            // early_stop_i = (t+1)/every; TODO Remove, must be useless now
            if (compute_cost) {
                lastmeancost << meancost;
                meancost.clear();
            }
        }
    }
    return lastmeancost[0];
}

///////////////
// optimizeN //
///////////////
bool ConjGradientOptimizer::optimizeN(VecStatsCollector& stats_coll) {
    real df, current_cost;
    meancost.clear();
    int stage_max = stage + nstages; // the stage to reach

#ifdef BOUNDCHECK
    if (current_step_size <= 0) {
        PLERROR("In ConjGradientOptimizer::optimizeN - current_step_size <= 0, have you called reset() ?");
    }
#endif

    if (stage == 0)
    {
        computeOppositeGradient(this, current_opp_gradient);
        search_direction << current_opp_gradient; // first direction = -grad
        last_cost = cost->value[0];
    }

    if (early_stop)
        stats_coll.update(cost->value);

    for (; !early_stop && stage<stage_max; stage++) {

        // Make a line search along the current search direction
        early_stop = lineSearch();
        computeOppositeGradient(this, delta);
        current_cost = cost->value[0];
        if (compute_cost) {
            meancost += cost->value;
        }
        stats_coll.update(cost->value);

        // Find the new search direction
        early_stop = early_stop || findDirection();

        last_improvement = last_cost - current_cost;
        last_cost = current_cost;

        // This value of current_step_size is suggested by Fletcher
        df = max (last_improvement, 10*stop_epsilon);
        current_step_size = min(1.0, 2.0*df / dot(search_direction, current_opp_gradient));

    }

    if (compute_cost) {
        meancost /= real(nstages);
        printStep(cerr, stage, meancost[0]);
        early_stop = early_stop || measure(stage+1, meancost);
    }

    // TODO Call the Stats collector
    if (early_stop)
        cout << "Early Stopping !" << endl;
    return early_stop;
}

//////////////////
// polakRibiere //
//////////////////
real ConjGradientOptimizer::polakRibiere (
    void (*grad)(Optimizer*, const Vec&),
    ConjGradientOptimizer* opt) {
    int i;
    // delta = gradient
    real normg = pownorm(opt->current_opp_gradient);
    for (i=0; i<opt->current_opp_gradient.length(); i++) {
        opt->tmp_storage[i] = opt->delta[i] - opt->current_opp_gradient[i];
    }
    real gamma = dot(opt->tmp_storage, opt->delta) / normg;
    return gamma;
}

///////////////////////
// quadraticInterpol //
///////////////////////
void ConjGradientOptimizer::quadraticInterpol(
    real f0, real f1, real g0,
    real& a, real& b, real& c) {
    c = f0;
    b = g0;
    a = f1 - f0 - g0;
}

///////////
// reset //
///////////
void ConjGradientOptimizer::reset() {
    inherited::reset();
    early_stop = false;
    last_improvement = 0.1; // May only influence the speed of the first iteration.
    current_step_size = starting_step_size;
}

///////////////////////////
// updateSearchDirection //
///////////////////////////
void ConjGradientOptimizer::updateSearchDirection(real gamma) {
    if (gamma == 0)
        search_direction << delta;
    else
        for (int i=0; i<search_direction.length(); i++)
            search_direction[i] = delta[i] + gamma * search_direction[i];
    current_opp_gradient << delta;
}

} // end of namespace PLearn
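
To make the three-step loop described above concrete outside of the PLearn class hierarchy, here is a minimal, self-contained sketch (illustrative only; none of the identifiers below come from PLearn) of conjugate gradient with an exact line search, a Polak-Ribiere update, and a restart test playing the same role as the restart_coeff option, applied to a simple two-dimensional quadratic cost:

// Standalone illustration of the conjugate gradient loop (not PLearn code).
#include <cmath>
#include <cstdio>
#include <vector>

typedef std::vector<double> Vec;

static double dot(const Vec& a, const Vec& b) {
    double s = 0;
    for (int i = 0; i < (int) a.size(); i++)
        s += a[i] * b[i];
    return s;
}

// Example cost: f(x) = 0.5 * (x0^2 + 10 * x1^2), with gradient (x0, 10 * x1).
static double cost(const Vec& x)     { return 0.5 * (x[0]*x[0] + 10 * x[1]*x[1]); }
static Vec    gradient(const Vec& x) { Vec g(2); g[0] = x[0]; g[1] = 10 * x[1]; return g; }

int main() {
    Vec x(2); x[0] = 3; x[1] = -2;
    const double restart_coeff = 0.2;          // same role as the "restart_coeff" option
    Vec g = gradient(x);
    Vec d(2); d[0] = -g[0]; d[1] = -g[1];      // step 0: d = -gradient
    for (int k = 0; k < 20; k++) {
        // Step 1: line search along d. For this quadratic cost the exact
        // minimizer is available in closed form: alpha = -g.d / (d.A.d).
        Vec Ad(2); Ad[0] = d[0]; Ad[1] = 10 * d[1];
        double alpha = -dot(g, d) / dot(d, Ad);
        for (int i = 0; i < 2; i++)
            x[i] += alpha * d[i];

        // Step 2: Polak-Ribiere update of the search direction, with a restart
        // (gamma = 0) when gamma is negative or when consecutive gradients are
        // far from orthogonal, as in ConjGradientOptimizer::findDirection().
        Vec g_new = gradient(x);
        double gamma = (dot(g_new, g_new) - dot(g_new, g)) / dot(g, g);
        if (gamma < 0 || std::fabs(dot(g_new, g)) > restart_coeff * dot(g_new, g_new))
            gamma = 0;
        for (int i = 0; i < 2; i++)
            d[i] = -g_new[i] + gamma * d[i];
        g = g_new;

        printf("iteration %d: cost = %g\n", k + 1, cost(x));
        if (dot(g, g) < 1e-12)
            break;
    }
    return 0;
}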
