#include <plearn/var/AffineTransformVariable.h>
#include <plearn/var/AffineTransformWeightPenalty.h>
#include <plearn/var/BinaryClassificationLossVariable.h>
#include <plearn/var/ClassificationLossVariable.h>
#include <plearn/var/ConcatColumnsVariable.h>
#include <plearn/var/CrossEntropyVariable.h>
#include <plearn/var/ExpVariable.h>
#include <plearn/var/IfThenElseVariable.h>
#include <plearn/var/IsMissingVariable.h>
#include <plearn/var/LiftOutputVariable.h>
#include <plearn/var/LogSoftmaxVariable.h>
#include <plearn/var/MulticlassLossVariable.h>
#include <plearn/var/NegCrossEntropySigmoidVariable.h>
#include <plearn/var/OneHotSquaredLoss.h>
#include <plearn/var/SemiSupervisedProbClassCostVariable.h>
#include <plearn/var/SigmoidVariable.h>
#include <plearn/var/SoftmaxVariable.h>
#include <plearn/var/SoftplusVariable.h>
#include <plearn/var/SourceVariable.h>
#include <plearn/var/SubMatVariable.h>
#include <plearn/var/SumVariable.h>
#include <plearn/var/SumOfVariable.h>
#include <plearn/var/SumSquareVariable.h>
#include <plearn/var/TanhVariable.h>
#include <plearn/var/TransposeProductVariable.h>
#include <plearn/var/Var_operators.h>
#include <plearn/var/Var_utils.h>
#include <plearn/var/WeightedSumSquareVariable.h>

#include <plearn/math/random.h>

#include "NeuralNet.h"
namespace PLearn {
using namespace std;

PLEARN_IMPLEMENT_OBJECT(NeuralNet,
                        "DEPRECATED: Use NNet instead",
                        "NO HELP");
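// NeuralNet builds, as a graph of PLearn Var nodes, a feedforward network:
// the input (optionally standardized with the 'normalization' matrices) goes
// through up to two tanh hidden layers of sizes 'nhidden' and 'nhidden2',
// then through an affine output layer, optionally followed by an output
// transfer function (tanh, sigmoid, softplus, exp, softmax or log_softmax).
// Direct input-to-output connections and per-layer weight decay penalties
// can also be added.  See build_() below for the details.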
NeuralNet::NeuralNet()
    : nhidden(0),
      nhidden2(0),
      weight_decay(0),
      bias_decay(0),
      layer1_weight_decay(0),
      layer1_bias_decay(0),
      layer2_weight_decay(0),
      layer2_bias_decay(0),
      output_layer_weight_decay(0),
      output_layer_bias_decay(0),
      direct_in_to_out_weight_decay(0),
      direct_in_to_out(false),
      output_transfer_func(""),
      iseed(-1),
      semisupervised_flatten_factor(1),
      batch_size(1),
      nepochs(10000),
      saveparams("")
{}

NeuralNet::~NeuralNet()
{
}
void NeuralNet::declareOptions(OptionList& ol)
{
    declareOption(ol, "nhidden", &NeuralNet::nhidden, OptionBase::buildoption,
                  "    number of hidden units in first hidden layer (0 means no hidden layer)\n");

    declareOption(ol, "nhidden2", &NeuralNet::nhidden2, OptionBase::buildoption,
                  "    number of hidden units in second hidden layer (0 means no hidden layer)\n");

    declareOption(ol, "weight_decay", &NeuralNet::weight_decay, OptionBase::buildoption,
                  "    global weight decay for all layers\n");

    declareOption(ol, "bias_decay", &NeuralNet::bias_decay, OptionBase::buildoption,
                  "    global bias decay for all layers\n");

    declareOption(ol, "layer1_weight_decay", &NeuralNet::layer1_weight_decay, OptionBase::buildoption,
                  "    Additional weight decay for the first hidden layer. Is added to weight_decay.\n");

    declareOption(ol, "layer1_bias_decay", &NeuralNet::layer1_bias_decay, OptionBase::buildoption,
                  "    Additional bias decay for the first hidden layer. Is added to bias_decay.\n");

    declareOption(ol, "layer2_weight_decay", &NeuralNet::layer2_weight_decay, OptionBase::buildoption,
                  "    Additional weight decay for the second hidden layer. Is added to weight_decay.\n");

    declareOption(ol, "layer2_bias_decay", &NeuralNet::layer2_bias_decay, OptionBase::buildoption,
                  "    Additional bias decay for the second hidden layer. Is added to bias_decay.\n");

    declareOption(ol, "output_layer_weight_decay", &NeuralNet::output_layer_weight_decay, OptionBase::buildoption,
                  "    Additional weight decay for the output layer. Is added to 'weight_decay'.\n");

    declareOption(ol, "output_layer_bias_decay", &NeuralNet::output_layer_bias_decay, OptionBase::buildoption,
                  "    Additional bias decay for the output layer. Is added to 'bias_decay'.\n");

    declareOption(ol, "direct_in_to_out_weight_decay", &NeuralNet::direct_in_to_out_weight_decay, OptionBase::buildoption,
                  "    Additional weight decay for the direct in-to-out layer. Is added to 'weight_decay'.\n");

    declareOption(ol, "direct_in_to_out", &NeuralNet::direct_in_to_out, OptionBase::buildoption,
                  "    should we include direct input to output connections?\n");

    declareOption(ol, "output_transfer_func", &NeuralNet::output_transfer_func, OptionBase::buildoption,
                  "    what transfer function to use for the output layer? \n"
                  "    one of: tanh, sigmoid, softplus, exp, softmax, log_softmax \n"
                  "    an empty string means no output transfer function \n");

    declareOption(ol, "seed", &NeuralNet::iseed, OptionBase::buildoption,
                  "    Seed for the random number generator used to initialize parameters. If -1 then use time of day.\n");

    declareOption(ol, "cost_funcs", &NeuralNet::cost_funcs, OptionBase::buildoption,
                  "    a list of cost functions to use\n"
                  "    in the form \"[ cf1; cf2; cf3; ... ]\" where each function is one of: \n"
                  "      mse (for regression)\n"
                  "      mse_onehot (for classification)\n"
                  "      NLL (negative log likelihood -log(p[c]) for classification) \n"
                  "      class_error (classification error) \n"
                  "      multiclass_error \n"
                  "      cross_entropy \n"
                  "      semisupervised_prob_class\n"
                  "    The first function of the list will be used as \n"
                  "    the objective function to optimize \n"
                  "    (possibly with an added weight decay penalty) \n"
                  "    If semisupervised_prob_class is chosen, then the options\n"
                  "    semisupervised_{flatten_factor,prior} will be used. Note that\n"
                  "    the output_transfer_func should be the softmax, in that case.\n");

    declareOption(ol, "semisupervised_flatten_factor", &NeuralNet::semisupervised_flatten_factor, OptionBase::buildoption,
                  "    Hyper-parameter of the semi-supervised criterion for probabilistic classifiers\n");

    declareOption(ol, "semisupervised_prior", &NeuralNet::semisupervised_prior, OptionBase::buildoption,
                  "    Hyper-parameter of the semi-supervised criterion = prior classes probabilities\n");

    declareOption(ol, "optimizer", &NeuralNet::optimizer, OptionBase::buildoption,
                  "    specify the optimizer to use\n");

    declareOption(ol, "batch_size", &NeuralNet::batch_size, OptionBase::buildoption,
                  "    how many samples to use to estimate the average gradient before updating the weights\n"
                  "    0 is equivalent to specifying training_set->length() \n"
                  "    NOTE: this overrides the optimizer's 'n_updates' and 'every_iterations'.\n");

    declareOption(ol, "nepochs", &NeuralNet::nepochs, OptionBase::buildoption,
                  "    how many times the optimizer gets to see the whole training set.\n");

    declareOption(ol, "paramsvalues", &NeuralNet::paramsvalues, OptionBase::learntoption,
                  "    The learned parameter vector\n");

    declareOption(ol, "saveparams", &NeuralNet::saveparams, OptionBase::learntoption,
                  "    This string, if not empty, indicates where in the expdir directory\n"
                  "    to save the final paramsvalues\n");

    declareOption(ol, "normalization", &NeuralNet::normalization, OptionBase::buildoption,
                  "    The normalization to be applied to the data\n");

    inherited::declareOptions(ol);
}
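// For example, a typical classification setup (following the help strings
// above) would use output_transfer_func = "softmax" together with
// cost_funcs = [ NLL; class_error ]: NLL is then the optimized objective
// (plus any weight decay penalty) and class_error is only reported.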
void NeuralNet::build()
{
    inherited::build();
    build_();
}
void NeuralNet::build_()
{
    // Input variable, optionally standardized with the 'normalization' matrices.
    input = Var(inputsize(), "input");
    if (normalization.length()) {
        Var means(normalization[0]);
        Var stddevs(normalization[1]);
        output = (input - means) / stddevs;
    }
    else
        output = input;
    params.resize(0);

    // First hidden layer.
    if(nhidden>0)
    {
        w1 = Var(1+inputsize(), nhidden, "w1");
        output = tanh(affine_transform(output, w1));
        params.append(w1);
    }

    // Second hidden layer.
    if(nhidden2>0)
    {
        w2 = Var(1+nhidden, nhidden2, "w2");
        output = tanh(affine_transform(output, w2));
        params.append(w2);
    }

    // Output layer.
    wout = Var(1+output->size(), outputsize(), "wout");
    output = affine_transform(output, wout);
    params.append(wout);

    // Optional direct input-to-output connections.
    if(direct_in_to_out)
    {
        wdirect = Var(inputsize(), outputsize(), "wdirect");
        output += transposeProduct(wdirect, input);
        params.append(wdirect);
    }

    // Output transfer function.
    if(output_transfer_func!="")
    {
        if(output_transfer_func=="tanh")
            output = tanh(output);
        else if(output_transfer_func=="sigmoid")
            output = sigmoid(output);
        else if(output_transfer_func=="softplus")
            output = softplus(output);
        else if(output_transfer_func=="exp")
            output = exp(output);
        else if(output_transfer_func=="softmax")
            output = softmax(output);
        else if(output_transfer_func=="log_softmax")
            output = log_softmax(output);
        else
            PLERROR("In NeuralNet::build_() unknown output_transfer_func option: %s",
                    output_transfer_func.c_str());
    }
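    // The target part of each training row may be followed by weightsize()
    // cost weights; the checks and SubMatVariables below split the
    // 'target_and_weights' Var accordingly.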
    if(weightsize() != 0 && weightsize() != 1 && targetsize()/2 != weightsize())
        PLERROR("In NeuralNet::build_() weightsize must be either:\n"
                "\t0: no weights on costs\n"
                "\t1: single weight applied on total cost\n"
                "\ttargetsize/2: vector of weights applied individually to each component of the cost\n"
                "weightsize= %d; targetsize= %d.", weightsize(), targetsize());

    target_and_weights = Var(targetsize(), "target_and_weights");
    target = new SubMatVariable(target_and_weights, 0, 0, targetsize()-weightsize(), 1);
    target->setName("target");
    if(0 < weightsize())
    {
        costweights = new SubMatVariable(target_and_weights, targetsize()-weightsize(), 0, weightsize(), 1);
        costweights->setName("costweights");
    }

    int ncosts = cost_funcs.size();
    if(ncosts<=0)
        PLERROR("In NeuralNet::build_() Empty cost_funcs : must at least specify the cost function to optimize!");
    costs.resize(ncosts);

    for(int k=0; k<ncosts; k++)
    {
        bool handles_missing_target = false;
        if(cost_funcs[k]=="mse")
        {
            if(weightsize() < 2)
                costs[k] = sumsquare(output-target);
            else
                costs[k] = weighted_sumsquare(output-target, costweights);
        }
        else if(cost_funcs[k]=="mse_onehot")
            costs[k] = onehot_squared_loss(output, target);
        else if(cost_funcs[k]=="NLL")
        {
            if(output_transfer_func=="log_softmax")
                costs[k] = -output[target];
            else
                costs[k] = neg_log_pi(output, target);
        }
        else if(cost_funcs[k]=="class_error")
            costs[k] = classification_loss(output, target);
        else if(cost_funcs[k]=="multiclass_error")
        {
            if(weightsize() < 2)
                costs[k] = multiclass_loss(output, target);
            else
                PLERROR("In NeuralNet::build() weighted multiclass error cost not implemented.");
        }
        else if(cost_funcs[k]=="cross_entropy")
        {
            if(weightsize() < 2)
                costs[k] = cross_entropy(output, target);
            else
                PLERROR("In NeuralNet::build() weighted cross entropy cost not implemented.");
        }
        else if(cost_funcs[k]=="semisupervised_prob_class")
        {
            if(output_transfer_func!="softmax")
                PLWARNING("To properly use the semisupervised_prob_class criterion, the transfer function "
                          "should probably be a softmax, to guarantee positive probabilities summing to 1");
            if(semisupervised_prior.length()==0)
            {
                semisupervised_prior.resize(outputsize());
                semisupervised_prior.fill(1.0);
            }
            costs[k] = new SemiSupervisedProbClassCostVariable(output, target,
                                                               new SourceVariable(semisupervised_prior),
                                                               semisupervised_flatten_factor);
            handles_missing_target = true;
        }
        else
        {
            costs[k] = dynamic_cast<Variable*>(newObject(cost_funcs[k]));
            if(costs[k].isNull())
                PLERROR("In NeuralNet::build_() unknown cost_func option: %s", cost_funcs[k].c_str());
            if(weightsize() < 2)
                costs[k]->setParents(output & target);
            else
                costs[k]->setParents(output & target & costweights);
            costs[k]->build();
        }

        if(1 == weightsize())
            costs[k] = costs[k] * costweights;

        // Unless the cost handles missing targets itself, a missing target
        // yields a missing cost.
        if(!handles_missing_target)
            costs[k] = ifThenElse(isMissing(target), var(MISSING_VALUE), costs[k]);
    }
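    // Weight decay penalties: each layer gets the global decay plus its own
    // additional decay.  The final 'cost' Var puts the training objective
    // (first declared cost plus penalties) in its first element, followed by
    // every declared cost.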
    VarArray penalties;
    if(w1 && ((layer1_weight_decay + weight_decay)!=0 || (layer1_bias_decay + bias_decay)!=0))
        penalties.append(affine_transform_weight_penalty(w1, (layer1_weight_decay + weight_decay),
                                                         (layer1_bias_decay + bias_decay)));
    if(w2 && ((layer2_weight_decay + weight_decay)!=0 || (layer2_bias_decay + bias_decay)!=0))
        penalties.append(affine_transform_weight_penalty(w2, (layer2_weight_decay + weight_decay),
                                                         (layer2_bias_decay + bias_decay)));
    if(wout && ((output_layer_weight_decay + weight_decay)!=0 || (output_layer_bias_decay + bias_decay)!=0))
        penalties.append(affine_transform_weight_penalty(wout, (output_layer_weight_decay + weight_decay),
                                                         (output_layer_bias_decay + bias_decay)));
    if(wdirect && (direct_in_to_out_weight_decay + weight_decay) != 0)
        penalties.append(sumsquare(wdirect)*(direct_in_to_out_weight_decay + weight_decay));

    if(penalties.size() != 0)
        cost = hconcat(sum(hconcat(costs[0] & penalties)) & costs);
    else
        cost = hconcat(costs[0] & costs);

    cost->setName("cost");
    output->setName("output");

    // Reuse saved parameter values if they match the current architecture,
    // otherwise (re)initialize the parameters.
    if((bool)(paramsvalues) && (paramsvalues.size() == params.nelems()))
    {
        params << paramsvalues;
        initial_paramsvalues.resize(paramsvalues.length());
        initial_paramsvalues << paramsvalues;
    }
    else
    {
        paramsvalues.resize(params.nelems());
        initializeParams();
    }
    params.makeSharedValue(paramsvalues);

    // Functions mapping input -> output, (input, target) -> (output, cost)
    // and (output, target) -> cost.
    f = Func(input, output);
    costf = Func(input & target_and_weights, output & cost);
    costf->recomputeParents();
    output_and_target_to_cost = Func(output & target_and_weights, cost);
    output_and_target_to_cost->recomputeParents();
}
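// Rough usage sketch (illustrative, not a tested recipe): create a NeuralNet,
// set its build options (nhidden, output_transfer_func, cost_funcs, optimizer,
// nepochs, ...), call build(), then call train() on a training VMat.
// Predictions and costs can then be obtained with use() / useAndCost() below.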
Array<string> NeuralNet::costNames() const
{
    return (cost_funcs[0]+"+penalty") & cost_funcs;
}

int NeuralNet::costsize() const
{ return cost->size(); }
void NeuralNet::train(VMat training_set)
{
    setTrainingSet(training_set);
    int l = training_set->length();
    int nsamples = batch_size>0 ? batch_size : l;
    Func paramf = Func(input & target_and_weights, cost);
    Var totalcost = meanOf(training_set, paramf, nsamples);
    optimizer->setToOptimize(params, totalcost);
    optimizer->nupdates = (nepochs*l)/nsamples;
    optimizer->every = l/nsamples;
    optimizer->addMeasurer(*this);
    optimizer->build();
    optimizer->optimize();

    output_and_target_to_cost->recomputeParents();
    costf->recomputeParents();

    setTrainCost(totalcost->value);
    if (saveparams!="")
        PLearn::save(expdir+saveparams, paramsvalues);
}
void NeuralNet::initializeParams()
{
    if (iseed<0)
        seed();
    else
        manual_seed(iseed);

    // Initialize weights with small random values whose scale shrinks with
    // the fan-in of each layer, and zero the bias rows (row 0 of each matrix).
    real delta = 1./inputsize();
    if(nhidden>0)
    {
        fill_random_normal(w1->value, 0, delta);
        if(direct_in_to_out)
        {
            fill_random_normal(wdirect->value, 0, delta);
            wdirect->matValue(0).clear();
        }
        delta = 1./nhidden;
        w1->matValue(0).clear();
    }
    if(nhidden2>0)
    {
        fill_random_normal(w2->value, 0, delta);
        delta = 1./nhidden2;
        w2->matValue(0).clear();
    }
    fill_random_normal(wout->value, 0, delta);
    wout->matValue(0).clear();
}
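// The remaining methods are thin wrappers around the Funcs built in build_():
// use() computes the network output for an input, useAndCost() additionally
// evaluates the costs, and computeCost() evaluates the costs from an already
// computed output.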
void NeuralNet::use(const Vec& in, Vec& prediction)
{
    f->fprop(in, prediction);
}

void NeuralNet::useAndCost(const Vec& inputvec, const Vec& targetvec, Vec outputvec, Vec costvec)
{
    costf->fprop(inputvec&targetvec, outputvec&costvec);
}

void NeuralNet::computeCost(const Vec& inputvec, const Vec& targetvec, const Vec& outputvec, const Vec& costvec)
{
    output_and_target_to_cost->fprop(outputvec&targetvec, costvec);
}

void NeuralNet::forget()
{
    if(initial_paramsvalues)
        params << initial_paramsvalues;
    else
        initializeParams();
    inherited::forget();
}

void NeuralNet::makeDeepCopyFromShallowCopy(CopiesMap& copies)
{
    inherited::makeDeepCopyFromShallowCopy(copies);
    deepCopyField(optimizer, copies);
}

} // end of namespace PLearn