PLearn: NNet.cc Source File

00001 // -*- C++ -*- 00002 00003 // NNet.cc 00004 // Copyright (c) 1998-2002 Pascal Vincent 00005 // Copyright (C) 1999-2002 Yoshua Bengio and University of Montreal 00006 // Copyright (c) 2002 Jean-Sebastien Senecal, Xavier Saint-Mleux, Rejean Ducharme 00007 // 00008 // Redistribution and use in source and binary forms, with or without 00009 // modification, are permitted provided that the following conditions are met: 00010 // 00011 // 1. Redistributions of source code must retain the above copyright 00012 // notice, this list of conditions and the following disclaimer. 00013 // 00014 // 2. Redistributions in binary form must reproduce the above copyright 00015 // notice, this list of conditions and the following disclaimer in the 00016 // documentation and/or other materials provided with the distribution. 00017 // 00018 // 3. The name of the authors may not be used to endorse or promote 00019 // products derived from this software without specific prior written 00020 // permission. 00021 // 00022 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00023 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00024 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00025 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00026 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00027 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00028 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00029 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00030 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00031 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00032 // 00033 // This file is part of the PLearn library. For more information on the PLearn 00034 // library, go to the PLearn Web site at www.plearn.org 00035 00036 00037 /* ******************************************************* 00038 * $Id: NNet.cc,v 1.56 2004/07/21 16:30:56 chrish42 Exp $ 00039 ******************************************************* */ 00040 00043 #include <plearn/var/AffineTransformVariable.h> 00044 #include <plearn/var/AffineTransformWeightPenalty.h> 00045 #include <plearn/var/BinaryClassificationLossVariable.h> 00046 #include <plearn/var/ClassificationLossVariable.h> 00047 #include <plearn/var/ConcatColumnsVariable.h> 00048 #include <plearn/var/CrossEntropyVariable.h> 00049 #include <plearn/var/ExpVariable.h> 00050 #include <plearn/var/LiftOutputVariable.h> 00051 #include <plearn/var/LogSoftmaxVariable.h> 00052 #include <plearn/var/MarginPerceptronCostVariable.h> 00053 #include <plearn/var/MulticlassLossVariable.h> 00054 #include <plearn/var/NegCrossEntropySigmoidVariable.h> 00055 #include <plearn/var/OneHotSquaredLoss.h> 00056 // #include "RBFLayerVariable.h" //TODO Put it back when the file exists. 00057 #include <plearn/var/SigmoidVariable.h> 00058 #include <plearn/var/SoftmaxVariable.h> 00059 #include <plearn/var/SoftplusVariable.h> 00060 #include <plearn/var/SumVariable.h> 00061 #include <plearn/var/SumAbsVariable.h> 00062 #include <plearn/var/SumOfVariable.h> 00063 #include <plearn/var/SumSquareVariable.h> 00064 #include <plearn/var/TanhVariable.h> 00065 #include <plearn/var/TransposeProductVariable.h> 00066 #include <plearn/var/UnaryHardSlopeVariable.h> 00067 #include <plearn/var/Var_operators.h> 00068 #include <plearn/var/Var_utils.h> 00069 00070 #include <plearn/vmat/ConcatColumnsVMatrix.h> 00071 //#include "DisplayUtils.h" 00072 //#include "GradientOptimizer.h" 00073 #include "NNet.h" 00074 #include <plearn/math/random.h> 00075 #include <plearn/vmat/SubVMatrix.h> 00076 00077 namespace PLearn { 00078 using namespace std; 00079 00080 PLEARN_IMPLEMENT_OBJECT(NNet, "Ordinary Feedforward Neural Network with 1 or 2 hidden layers", 00081 "Neural network with many bells and whistles..."); 00082 00083 NNet::NNet() // DEFAULT VALUES FOR ALL OPTIONS 00084 : 00085 nhidden(0), 00086 nhidden2(0), 00087 noutputs(0), 00088 weight_decay(0), 00089 bias_decay(0), 00090 layer1_weight_decay(0), 00091 layer1_bias_decay(0), 00092 layer2_weight_decay(0), 00093 layer2_bias_decay(0), 00094 output_layer_weight_decay(0), 00095 output_layer_bias_decay(0), 00096 direct_in_to_out_weight_decay(0), 00097 classification_regularizer(0), 00098 margin(1), 00099 fixed_output_weights(0), 00100 rbf_layer_size(0), 00101 first_class_is_junk(1), 00102 L1_penalty(false), 00103 input_reconstruction_penalty(0), 00104 direct_in_to_out(false), 00105 output_transfer_func(""), 00106 hidden_transfer_func("tanh"), 00107 interval_minval(0), interval_maxval(1), 00108 batch_size(1), 00109 initialization_method("normal_linear") 00110 {} 00111 00112 NNet::~NNet() 00113 { 00114 } 00115 00116 void NNet::declareOptions(OptionList& ol) 00117 { 00118 declareOption(ol, "nhidden", &NNet::nhidden, OptionBase::buildoption, 00119 " number of hidden units in first hidden layer (0 means no hidden layer)\n"); 00120 00121 declareOption(ol, "nhidden2", &NNet::nhidden2, OptionBase::buildoption, 00122 " number of hidden units in second hidden layer (0 means no hidden layer)\n"); 00123 00124 declareOption(ol, "noutputs", &NNet::noutputs, OptionBase::buildoption, 00125 " number of output units. This gives this learner its outputsize.\n" 00126 " It is typically of the same dimensionality as the target for regression problems \n" 00127 " But for classification problems where target is just the class number, noutputs is \n" 00128 " usually of dimensionality number of classes (as we want to output a score or probability \n" 00129 " vector, one per class)"); 00130 00131 declareOption(ol, "weight_decay", &NNet::weight_decay, OptionBase::buildoption, 00132 " global weight decay for all layers\n"); 00133 00134 declareOption(ol, "bias_decay", &NNet::bias_decay, OptionBase::buildoption, 00135 " global bias decay for all layers\n"); 00136 00137 declareOption(ol, "layer1_weight_decay", &NNet::layer1_weight_decay, OptionBase::buildoption, 00138 " Additional weight decay for the first hidden layer. Is added to weight_decay.\n"); 00139 00140 declareOption(ol, "layer1_bias_decay", &NNet::layer1_bias_decay, OptionBase::buildoption, 00141 " Additional bias decay for the first hidden layer. Is added to bias_decay.\n"); 00142 00143 declareOption(ol, "layer2_weight_decay", &NNet::layer2_weight_decay, OptionBase::buildoption, 00144 " Additional weight decay for the second hidden layer. Is added to weight_decay.\n"); 00145 00146 declareOption(ol, "layer2_bias_decay", &NNet::layer2_bias_decay, OptionBase::buildoption, 00147 " Additional bias decay for the second hidden layer. Is added to bias_decay.\n"); 00148 00149 declareOption(ol, "output_layer_weight_decay", &NNet::output_layer_weight_decay, OptionBase::buildoption, 00150 " Additional weight decay for the output layer. Is added to 'weight_decay'.\n"); 00151 00152 declareOption(ol, "output_layer_bias_decay", &NNet::output_layer_bias_decay, OptionBase::buildoption, 00153 " Additional bias decay for the output layer. Is added to 'bias_decay'.\n"); 00154 00155 declareOption(ol, "direct_in_to_out_weight_decay", &NNet::direct_in_to_out_weight_decay, OptionBase::buildoption, 00156 " Additional weight decay for the direct in-to-out layer. Is added to 'weight_decay'.\n"); 00157 00158 declareOption(ol, "L1_penalty", &NNet::L1_penalty, OptionBase::buildoption, 00159 " should we use L1 penalty instead of the default L2 penalty on the weights?\n"); 00160 00161 declareOption(ol, "fixed_output_weights", &NNet::fixed_output_weights, OptionBase::buildoption, 00162 " If true then the output weights are not learned. They are initialized to +1 or -1 randomly.\n"); 00163 00164 declareOption(ol, "input_reconstruction_penalty", &NNet::input_reconstruction_penalty, OptionBase::buildoption, 00165 " if >0 then a set of weights will be added from a hidden layer to predict (reconstruct) the inputs\n" 00166 " and the total loss will include an extra term that is the squared input reconstruction error,\n" 00167 " multiplied by the input_reconstruction_penalty factor.\n"); 00168 00169 declareOption(ol, "direct_in_to_out", &NNet::direct_in_to_out, OptionBase::buildoption, 00170 " should we include direct input to output connections?\n"); 00171 00172 declareOption(ol, "rbf_layer_size", &NNet::rbf_layer_size, OptionBase::buildoption, 00173 " If non-zero, add an extra layer which computes N(h(x);mu_i,sigma_i) (Gaussian density) for the\n" 00174 " i-th output unit with mu_i a free vector and sigma_i a free scalar, and h(x) the vector of\n" 00175 " activations of the 'representation' output, i.e. what would be the output layer otherwise. The\n" 00176 " given non-zero value is the number of these 'representation' outputs. Typically this\n" 00177 " makes sense for classification problems, with a softmax output_transfer_func. If the\n" 00178 " first_class_is_junk option is set then the first output (first class) does not get a\n" 00179 " Gaussian density but just a 'pseudo-uniform' density (the single free parameter is the\n" 00180 " value of that density) and in a softmax it makes sure that when h(x) is far from the\n" 00181 " centers mu_i for all the other classes then the last class gets the strongest posterior probability.\n"); 00182 00183 declareOption(ol, "first_class_is_junk", &NNet::first_class_is_junk, OptionBase::buildoption, 00184 " This option is used only when rbf_layer_size>0. If true then the first class is\n" 00185 " treated differently and gets a pre-transfer-function value that is a learned constant, whereas\n" 00186 " the others get a normal centered at mu_i.\n"); 00187 00188 declareOption(ol, "output_transfer_func", &NNet::output_transfer_func, OptionBase::buildoption, 00189 " what transfer function to use for ouput layer? \n" 00190 " one of: tanh, sigmoid, exp, softplus, softmax, log_softmax \n" 00191 " or interval(<minval>,<maxval>), which stands for\n" 00192 " <minval>+(<maxval>-<minval>)*sigmoid(.).\n" 00193 " An empty string or \"none\" means no output transfer function \n"); 00194 00195 declareOption(ol, "hidden_transfer_func", &NNet::hidden_transfer_func, OptionBase::buildoption, 00196 " what transfer function to use for hidden units? \n" 00197 " one of: linear, tanh, sigmoid, exp, softplus, softmax, log_softmax, hard_slope or symm_hard_slope\n"); 00198 00199 declareOption(ol, "cost_funcs", &NNet::cost_funcs, OptionBase::buildoption, 00200 " a list of cost functions to use\n" 00201 " in the form \"[ cf1; cf2; cf3; ... ]\" where each function is one of: \n" 00202 " mse (for regression)\n" 00203 " mse_onehot (for classification)\n" 00204 " NLL (negative log likelihood -log(p[c]) for classification) \n" 00205 " class_error (classification error) \n" 00206 " binary_class_error (classification error for a 0-1 binary classifier)\n" 00207 " multiclass_error\n" 00208 " cross_entropy (for binary classification)\n" 00209 " stable_cross_entropy (more accurate backprop and possible regularization, for binary classification)\n" 00210 " margin_perceptron_cost (a hard version of the cross_entropy, uses the 'margin' option)\n" 00211 " lift_output (not a real cost function, just the output for lift computation)\n" 00212 " The first function of the list will be used as \n" 00213 " the objective function to optimize \n" 00214 " (possibly with an added weight decay penalty) \n"); 00215 00216 declareOption(ol, "classification_regularizer", &NNet::classification_regularizer, OptionBase::buildoption, 00217 " used only in the stable_cross_entropy cost function, to fight overfitting (0<=r<1)\n"); 00218 00219 declareOption(ol, "margin", &NNet::margin, OptionBase::buildoption, 00220 " margin requirement, used only with the margin_perceptron_cost cost function.\n" 00221 " It should be positive, and larger values regularize more.\n"); 00222 00223 declareOption(ol, "optimizer", &NNet::optimizer, OptionBase::buildoption, 00224 " specify the optimizer to use\n"); 00225 00226 declareOption(ol, "batch_size", &NNet::batch_size, OptionBase::buildoption, 00227 " how many samples to use to estimate the avergage gradient before updating the weights\n" 00228 " 0 is equivalent to specifying training_set->length() \n"); 00229 00230 declareOption(ol, "initialization_method", &NNet::initialization_method, OptionBase::buildoption, 00231 " The method used to initialize the weights:\n" 00232 " - normal_linear = a normal law with variance 1 / n_inputs\n" 00233 " - normal_sqrt = a normal law with variance 1 / sqrt(n_inputs)\n"); 00234 00235 declareOption(ol, "paramsvalues", &NNet::paramsvalues, OptionBase::learntoption, 00236 " The learned parameter vector\n"); 00237 00238 inherited::declareOptions(ol); 00239 00240 } 00241 00242 void NNet::build() 00243 { 00244 inherited::build(); 00245 build_(); 00246 } 00247 00248 void NNet::build_() 00249 { 00250 /* 00251 * Create Topology Var Graph 00252 */ 00253 00254 // Don't do anything if we don't have a train_set 00255 // It's the only one who knows the inputsize and targetsize anyway... 00256 00257 if(inputsize_>=0 && targetsize_>=0 && weightsize_>=0) 00258 { 00259 00260 00261 // init. basic vars 00262 input = Var(inputsize(), "input"); 00263 output = input; 00264 params.resize(0); 00265 Var hidden_layer; 00266 00267 // first hidden layer 00268 if(nhidden>0) 00269 { 00270 w1 = Var(1+inputsize(), nhidden, "w1"); 00271 hidden_layer = affine_transform(output,w1); 00272 params.append(w1); 00273 if(hidden_transfer_func=="linear") 00274 output = hidden_layer; 00275 else if(hidden_transfer_func=="tanh") 00276 output = tanh(hidden_layer); 00277 else if(hidden_transfer_func=="sigmoid") 00278 output = sigmoid(hidden_layer); 00279 else if(hidden_transfer_func=="softplus") 00280 output = softplus(hidden_layer); 00281 else if(hidden_transfer_func=="exp") 00282 output = exp(hidden_layer); 00283 else if(hidden_transfer_func=="softmax") 00284 output = softmax(hidden_layer); 00285 else if (hidden_transfer_func == "log_softmax") 00286 output = log_softmax(output); 00287 else if(hidden_transfer_func=="hard_slope") 00288 output = unary_hard_slope(hidden_layer,0,1); 00289 else if(hidden_transfer_func=="symm_hard_slope") 00290 output = unary_hard_slope(hidden_layer,-1,1); 00291 else 00292 PLERROR("In NNet::build_() unknown hidden_transfer_func option: %s",hidden_transfer_func.c_str()); 00293 } 00294 00295 // second hidden layer 00296 if(nhidden2>0) 00297 { 00298 w2 = Var(1+nhidden, nhidden2, "w2"); 00299 output = affine_transform(output,w2); 00300 params.append(w2); 00301 if(hidden_transfer_func=="linear") 00302 output = output; 00303 else if(hidden_transfer_func=="tanh") 00304 output = tanh(output); 00305 else if(hidden_transfer_func=="sigmoid") 00306 output = sigmoid(output); 00307 else if(hidden_transfer_func=="softplus") 00308 output = softplus(output); 00309 else if(hidden_transfer_func=="exp") 00310 output = exp(output); 00311 else if(hidden_transfer_func=="softmax") 00312 output = softmax(output); 00313 else if (hidden_transfer_func == "log_softmax") 00314 output = log_softmax(output); 00315 else if(hidden_transfer_func=="hard_slope") 00316 output = unary_hard_slope(output,0,1); 00317 else if(hidden_transfer_func=="symm_hard_slope") 00318 output = unary_hard_slope(output,-1,1); 00319 else 00320 PLERROR("In NNet::build_() unknown hidden_transfer_func option: %s",hidden_transfer_func.c_str()); 00321 } 00322 00323 if (nhidden2>0 && nhidden==0) 00324 PLERROR("NNet:: can't have nhidden2 (=%d) > 0 while nhidden=0",nhidden2); 00325 00326 if (rbf_layer_size>0) 00327 { 00328 if (first_class_is_junk) 00329 { 00330 rbf_centers = Var(outputsize()-1, rbf_layer_size, "rbf_centers"); 00331 rbf_sigmas = Var(outputsize()-1, "rbf_sigmas"); 00332 PLERROR("In NNet.cc, the code needs to be completed, rbf_layer isn't declared and thus it doesn't compile with the line below"); 00333 // TODO (Also put back the corresponding include). 00334 // output = hconcat(rbf_layer(output,rbf_centers,rbf_sigmas)&junk_prob); 00335 params.append(junk_prob); 00336 } 00337 else 00338 { 00339 rbf_centers = Var(outputsize(), rbf_layer_size, "rbf_centers"); 00340 rbf_sigmas = Var(outputsize(), "rbf_sigmas"); 00341 PLERROR("In NNet.cc, the code needs to be completed, rbf_layer isn't declared and thus it doesn't compile with the line below"); 00342 // output = rbf_layer(output,rbf_centers,rbf_sigmas); 00343 } 00344 params.append(rbf_centers); 00345 params.append(rbf_sigmas); 00346 } 00347 00348 // output layer before transfer function 00349 wout = Var(1+output->size(), outputsize(), "wout"); 00350 output = affine_transform(output,wout); 00351 if (!fixed_output_weights) 00352 params.append(wout); 00353 else 00354 { 00355 outbias = Var(output->size(),"outbias"); 00356 output = output + outbias; 00357 params.append(outbias); 00358 } 00359 00360 // direct in-to-out layer 00361 if(direct_in_to_out) 00362 { 00363 wdirect = Var(inputsize(), outputsize(), "wdirect");// Var(1+inputsize(), outputsize(), "wdirect"); 00364 output += transposeProduct(wdirect, input);// affine_transform(input,wdirect); 00365 params.append(wdirect); 00366 } 00367 00368 Var before_transfer_func = output; 00369 00370 /* 00371 * output_transfer_func 00372 */ 00373 size_t p=0; 00374 if(output_transfer_func!="" && output_transfer_func!="none") 00375 { 00376 if(output_transfer_func=="tanh") 00377 output = tanh(output); 00378 else if(output_transfer_func=="sigmoid") 00379 output = sigmoid(output); 00380 else if(output_transfer_func=="softplus") 00381 output = softplus(output); 00382 else if(output_transfer_func=="exp") 00383 output = exp(output); 00384 else if(output_transfer_func=="softmax") 00385 output = softmax(output); 00386 else if (output_transfer_func == "log_softmax") 00387 output = log_softmax(output); 00388 else if ((p=output_transfer_func.find("interval"))!=string::npos) 00389 { 00390 size_t q = output_transfer_func.find(","); 00391 interval_minval = atof(output_transfer_func.substr(p+1,q-(p+1)).c_str()); 00392 size_t r = output_transfer_func.find(")"); 00393 interval_maxval = atof(output_transfer_func.substr(q+1,r-(q+1)).c_str()); 00394 output = interval_minval + (interval_maxval - interval_minval)*sigmoid(output); 00395 } 00396 else 00397 PLERROR("In NNet::build_() unknown output_transfer_func option: %s",output_transfer_func.c_str()); 00398 } 00399 00400 /* 00401 * target and weights 00402 */ 00403 00404 target = Var(targetsize(), "target"); 00405 00406 if(weightsize_>0) 00407 { 00408 if (weightsize_!=1) 00409 PLERROR("NNet: expected weightsize to be 1 or 0 (or unspecified = -1, meaning 0), got %d",weightsize_); 00410 sampleweight = Var(1, "weight"); 00411 } 00412 /* 00413 * costfuncs 00414 */ 00415 int ncosts = cost_funcs.size(); 00416 if(ncosts<=0) 00417 PLERROR("In NNet::build_() Empty cost_funcs : must at least specify the cost function to optimize!"); 00418 costs.resize(ncosts); 00419 00420 for(int k=0; k<ncosts; k++) 00421 { 00422 // create costfuncs and apply individual weights if weightpart > 1 00423 if(cost_funcs[k]=="mse") 00424 costs[k]= sumsquare(output-target); 00425 else if(cost_funcs[k]=="mse_onehot") 00426 costs[k] = onehot_squared_loss(output, target); 00427 else if(cost_funcs[k]=="NLL") 00428 { 00429 if (output->size() == 1) { 00430 // Assume sigmoid output here! 00431 costs[k] = cross_entropy(output, target); 00432 } else { 00433 if (output_transfer_func == "log_softmax") 00434 costs[k] = -output[target]; 00435 else 00436 costs[k] = neg_log_pi(output, target); 00437 } 00438 } 00439 else if(cost_funcs[k]=="class_error") 00440 costs[k] = classification_loss(output, target); 00441 else if(cost_funcs[k]=="binary_class_error") 00442 costs[k] = binary_classification_loss(output, target); 00443 else if(cost_funcs[k]=="multiclass_error") 00444 costs[k] = multiclass_loss(output, target); 00445 else if(cost_funcs[k]=="cross_entropy") 00446 costs[k] = cross_entropy(output, target); 00447 else if (cost_funcs[k]=="stable_cross_entropy") { 00448 Var c = stable_cross_entropy(before_transfer_func, target); 00449 costs[k] = c; 00450 if (classification_regularizer) { 00451 // There is a regularizer to add to the cost function. 00452 dynamic_cast<NegCrossEntropySigmoidVariable*>((Variable*) c)-> 00453 setRegularizer(classification_regularizer); 00454 } 00455 } 00456 else if (cost_funcs[k]=="margin_perceptron_cost") 00457 costs[k] = margin_perceptron_cost(output,target,margin); 00458 else if (cost_funcs[k]=="lift_output") 00459 costs[k] = lift_output(output, target); 00460 else // Assume we got a Variable name and its options 00461 { 00462 costs[k]= dynamic_cast<Variable*>(newObject(cost_funcs[k])); 00463 if(costs[k].isNull()) 00464 PLERROR("In NNet::build_() unknown cost_func option: %s",cost_funcs[k].c_str()); 00465 costs[k]->setParents(output & target); 00466 costs[k]->build(); 00467 } 00468 00469 // take into account the sampleweight 00470 //if(sampleweight) 00471 // costs[k]= costs[k] * sampleweight; // NO, because this is taken into account (more properly) in stats->update 00472 } 00473 00474 00475 /* 00476 * weight and bias decay penalty 00477 */ 00478 00479 // create penalties 00480 penalties.resize(0); // prevents penalties from being added twice by consecutive builds 00481 if(w1 && ((layer1_weight_decay + weight_decay)!=0 || (layer1_bias_decay + bias_decay)!=0)) 00482 penalties.append(affine_transform_weight_penalty(w1, (layer1_weight_decay + weight_decay), (layer1_bias_decay + bias_decay), L1_penalty)); 00483 if(w2 && ((layer2_weight_decay + weight_decay)!=0 || (layer2_bias_decay + bias_decay)!=0)) 00484 penalties.append(affine_transform_weight_penalty(w2, (layer2_weight_decay + weight_decay), (layer2_bias_decay + bias_decay), L1_penalty)); 00485 if(wout && ((output_layer_weight_decay + weight_decay)!=0 || (output_layer_bias_decay + bias_decay)!=0)) 00486 penalties.append(affine_transform_weight_penalty(wout, (output_layer_weight_decay + weight_decay), 00487 (output_layer_bias_decay + bias_decay), L1_penalty)); 00488 if(wdirect && (direct_in_to_out_weight_decay + weight_decay) != 0) 00489 { 00490 if (L1_penalty) 00491 penalties.append(sumabs(wdirect)*(direct_in_to_out_weight_decay + weight_decay)); 00492 else 00493 penalties.append(sumsquare(wdirect)*(direct_in_to_out_weight_decay + weight_decay)); 00494 } 00495 00496 if (input_reconstruction_penalty>0) 00497 { 00498 wrec = Var(hidden_layer->size(),inputsize(),"wrec"); 00499 predicted_input = transposeProduct(wrec, hidden_layer); 00500 params.append(wrec); 00501 penalties.append(input_reconstruction_penalty*sumsquare(predicted_input - input)); 00502 } 00503 00504 test_costs = hconcat(costs); 00505 00506 // Apply penalty to cost. 00507 // If there is no penalty, we still add costs[0] as the first cost, in 00508 // order to keep the same number of costs as if there was a penalty. 00509 if(penalties.size() != 0) { 00510 if (weightsize_>0) 00511 // only multiply by sampleweight if there are weights 00512 training_cost = hconcat(sampleweight*sum(hconcat(costs[0] & penalties)) 00513 & (test_costs*sampleweight)); 00514 else { 00515 training_cost = hconcat(sum(hconcat(costs[0] & penalties)) & test_costs); 00516 } 00517 } 00518 else { 00519 if(weightsize_>0) { 00520 // only multiply by sampleweight if there are weights 00521 training_cost = hconcat(costs[0]*sampleweight & test_costs*sampleweight); 00522 } else { 00523 training_cost = hconcat(costs[0] & test_costs); 00524 } 00525 } 00526 00527 training_cost->setName("training_cost"); 00528 test_costs->setName("test_costs"); 00529 output->setName("output"); 00530 00531 // Shared values hack... 00532 if((bool)paramsvalues && (paramsvalues.size() == params.nelems())) 00533 params << paramsvalues; 00534 else 00535 { 00536 paramsvalues.resize(params.nelems()); 00537 initializeParams(); 00538 } 00539 params.makeSharedValue(paramsvalues); 00540 00541 // Funcs 00542 invars.resize(0); 00543 VarArray outvars; 00544 VarArray testinvars; 00545 if(input) 00546 { 00547 invars.push_back(input); 00548 testinvars.push_back(input); 00549 } 00550 if(output) 00551 outvars.push_back(output); 00552 if(target) 00553 { 00554 invars.push_back(target); 00555 testinvars.push_back(target); 00556 outvars.push_back(target); 00557 } 00558 if(sampleweight) 00559 { 00560 invars.push_back(sampleweight); 00561 } 00562 00563 f = Func(input, output); 00564 test_costf = Func(testinvars, output&test_costs); 00565 test_costf->recomputeParents(); 00566 output_and_target_to_cost = Func(outvars, test_costs); 00567 output_and_target_to_cost->recomputeParents(); 00568 } 00569 } 00570 00571 int NNet::outputsize() const 00572 { return noutputs; } 00573 00574 TVec<string> NNet::getTrainCostNames() const 00575 { 00576 return (cost_funcs[0]+"+penalty") & cost_funcs; 00577 } 00578 00579 TVec<string> NNet::getTestCostNames() const 00580 { 00581 return cost_funcs; 00582 } 00583 00584 00585 void NNet::train() 00586 { 00587 // NNet nstages is number of epochs (whole passages through the training set) 00588 // while optimizer nstages is number of weight updates. 00589 // So relationship between the 2 depends whether we are in stochastic, batch or minibatch mode 00590 00591 if(!train_set) 00592 PLERROR("In NNet::train, you did not setTrainingSet"); 00593 00594 if(!train_stats) 00595 PLERROR("In NNet::train, you did not setTrainStatsCollector"); 00596 00597 int l = train_set->length(); 00598 00599 if(f.isNull()) // Net has not been properly built yet (because build was called before the learner had a proper training set) 00600 build(); 00601 00602 // number of samples seen by optimizer before each optimizer update 00603 int nsamples = batch_size>0 ? batch_size : l; 00604 Func paramf = Func(invars, training_cost); // parameterized function to optimize 00605 Var totalcost = meanOf(train_set, paramf, nsamples); 00606 if(optimizer) 00607 { 00608 optimizer->setToOptimize(params, totalcost); 00609 optimizer->build(); 00610 } 00611 else PLERROR("RecommandationNet::train can't train without setting an optimizer first!"); 00612 00613 // number of optimizer stages corresponding to one learner stage (one epoch) 00614 int optstage_per_lstage = l/nsamples; 00615 00616 ProgressBar* pb = 0; 00617 if(report_progress) 00618 pb = new ProgressBar("Training NNet from stage " + tostring(stage) + " to " + tostring(nstages), nstages-stage); 00619 00620 int initial_stage = stage; 00621 bool early_stop=false; 00622 while(stage<nstages && !early_stop) 00623 { 00624 optimizer->nstages = optstage_per_lstage; 00625 train_stats->forget(); 00626 optimizer->early_stop = false; 00627 optimizer->optimizeN(*train_stats); 00628 train_stats->finalize(); 00629 if(verbosity>2) 00630 cout << "Epoch " << stage << " train objective: " << train_stats->getMean() << endl; 00631 ++stage; 00632 if(pb) 00633 pb->update(stage-initial_stage); 00634 } 00635 if(verbosity>1) 00636 cout << "EPOCH " << stage << " train objective: " << train_stats->getMean() << endl; 00637 00638 if(pb) 00639 delete pb; 00640 00641 output_and_target_to_cost->recomputeParents(); 00642 test_costf->recomputeParents(); 00643 // cerr << "totalcost->value = " << totalcost->value << endl; 00644 // cout << "Result for benchmark is: " << totalcost->value << endl; 00645 } 00646 00647 00648 00649 void NNet::computeOutput(const Vec& inputv, Vec& outputv) const 00650 { 00651 f->fprop(inputv,outputv); 00652 } 00653 00654 void NNet::computeOutputAndCosts(const Vec& inputv, const Vec& targetv, 00655 Vec& outputv, Vec& costsv) const 00656 { 00657 test_costf->fprop(inputv&targetv, outputv&costsv); 00658 } 00659 00660 00661 void NNet::computeCostsFromOutputs(const Vec& inputv, const Vec& outputv, 00662 const Vec& targetv, Vec& costsv) const 00663 { 00664 output_and_target_to_cost->fprop(outputv&targetv, costsv); 00665 } 00666 00667 void NNet::initializeParams() 00668 { 00669 if (seed_>=0) 00670 manual_seed(seed_); 00671 else 00672 PLearn::seed(); 00673 00674 //real delta = 1./sqrt(inputsize()); 00675 real delta = 0; 00676 if (initialization_method == "normal_linear") { 00677 delta = 1.0 / inputsize(); 00678 } else if (initialization_method == "normal_sqrt") { 00679 delta = 1.0 / sqrt(real(inputsize())); 00680 } else { 00681 PLERROR("In NNet::initializeParams - Unknown value for 'initialization_method'"); 00682 } 00683 00684 /* 00685 if(direct_in_to_out) 00686 { 00687 fill_random_uniform(wdirect->value, -delta, +delta); 00688 //fill_random_normal(wdirect->value, 0, delta); 00689 //wdirect->matValue(0).clear(); 00690 } 00691 */ 00692 if(nhidden>0) 00693 { 00694 fill_random_uniform(w1->value, -delta, +delta); 00695 //fill_random_normal(w1->value, 0, delta); 00696 if(direct_in_to_out) 00697 { 00698 fill_random_uniform(wdirect->value, -delta, +delta); 00699 //fill_random_normal(wdirect->value, 0, 0.01*delta); 00700 wdirect->matValue(0).clear(); 00701 } 00702 if (initialization_method == "normal_linear") { 00703 delta = 1.0 / real(nhidden); 00704 } else if (initialization_method == "normal_sqrt") { 00705 delta = 1.0 / sqrt(real(nhidden)); 00706 } 00707 w1->matValue(0).clear(); 00708 } 00709 if(nhidden2>0) 00710 { 00711 fill_random_uniform(w2->value, -delta, +delta); 00712 //fill_random_normal(w2->value, 0, delta); 00713 if (initialization_method == "normal_linear") { 00714 delta = 1.0 / real(nhidden2); 00715 } else if (initialization_method == "normal_sqrt") { 00716 delta = 1.0 / sqrt(real(nhidden2)); 00717 } 00718 w2->matValue(0).clear(); 00719 } 00720 if (fixed_output_weights) 00721 { 00722 static Vec values; 00723 if (values.size()==0) 00724 { 00725 values.resize(2); 00726 values[0]=-1; 00727 values[1]=1; 00728 } 00729 fill_random_discrete(wout->value, values); 00730 wout->matValue(0).clear(); 00731 } 00732 else 00733 { 00734 fill_random_uniform(wout->value, -delta, +delta); 00735 //fill_random_normal(wout->value, 0, delta); 00736 wout->matValue(0).clear(); 00737 } 00738 00739 // Reset optimizer 00740 if(optimizer) 00741 optimizer->reset(); 00742 } 00743 00744 void NNet::forget() 00745 { 00746 if (train_set) initializeParams(); 00747 stage = 0; 00748 } 00749 00751 extern void varDeepCopyField(Var& field, CopiesMap& copies); 00752 00753 void NNet::makeDeepCopyFromShallowCopy(CopiesMap& copies) 00754 { 00755 inherited::makeDeepCopyFromShallowCopy(copies); 00756 varDeepCopyField(input, copies); 00757 varDeepCopyField(target, copies); 00758 varDeepCopyField(sampleweight, copies); 00759 varDeepCopyField(w1, copies); 00760 varDeepCopyField(w2, copies); 00761 varDeepCopyField(wout, copies); 00762 varDeepCopyField(outbias, copies); 00763 varDeepCopyField(wdirect, copies); 00764 varDeepCopyField(wrec, copies); 00765 varDeepCopyField(rbf_centers, copies); 00766 varDeepCopyField(rbf_sigmas, copies); 00767 varDeepCopyField(junk_prob, copies); 00768 varDeepCopyField(output, copies); 00769 varDeepCopyField(predicted_input, copies); 00770 deepCopyField(costs, copies); 00771 deepCopyField(penalties, copies); 00772 varDeepCopyField(training_cost, copies); 00773 varDeepCopyField(test_costs, copies); 00774 deepCopyField(invars, copies); 00775 deepCopyField(params, copies); 00776 deepCopyField(paramsvalues, copies); 00777 deepCopyField(f, copies); 00778 deepCopyField(test_costf, copies); 00779 deepCopyField(output_and_target_to_cost, copies); 00780 deepCopyField(optimizer, copies); 00781 } 00782 00783 } // end of namespace PLearn