
ConditionalDensityNet.cc

// -*- C++ -*-

// ConditionalDensityNet.cc
//
// Copyright (C) 2003 Yoshua Bengio
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//  1. Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//
//  2. Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//
//  3. The name of the authors may not be used to endorse or promote
//     products derived from this software without specific prior written
//     permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
// NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This file is part of the PLearn library. For more information on the PLearn
// library, go to the PLearn Web site at www.plearn.org

/* *******************************************************
 * $Id: ConditionalDensityNet.cc,v 1.45 2004/07/21 16:30:55 chrish42 Exp $
 ******************************************************* */

// Authors: Yoshua Bengio

#include <plearn/var/AffineTransformVariable.h>
#include <plearn/var/AffineTransformWeightPenalty.h>
#include "ConditionalDensityNet.h"
#include <plearn/var/ConcatColumnsVariable.h>
#include <plearn/var/ConcatRowsVariable.h>
#include <plearn/var/CutBelowThresholdVariable.h>
#include <plearn/display/DisplayUtils.h>
#include <plearn/var/DotProductVariable.h>
#include <plearn/var/IfThenElseVariable.h>
#include <plearn/var/IsAboveThresholdVariable.h>
#include <plearn/var/LogVariable.h>
//#include "DilogarithmVariable.h"
#include <plearn/var/SoftSlopeVariable.h>
#include <plearn/var/SoftSlopeIntegralVariable.h>
//#include "plapack.h"
#include <plearn/var/SoftplusVariable.h>
#include <plearn/var/SubMatTransposeVariable.h>
#include <plearn/var/SubMatVariable.h>
#include <plearn/var/SumAbsVariable.h>
#include <plearn/var/SumOfVariable.h>
#include <plearn/var/SumSquareVariable.h>
#include <plearn/var/SumVariable.h>
#include <plearn/var/TanhVariable.h>
#include <plearn/var/TransposeProductVariable.h>
#include <plearn/math/random.h>

namespace PLearn {
using namespace std;

ConditionalDensityNet::ConditionalDensityNet()
    : nhidden(0),
      nhidden2(0),
      weight_decay(0),
      bias_decay(1e-6),
      layer1_weight_decay(0),
      layer1_bias_decay(0),
      layer2_weight_decay(0),
      layer2_bias_decay(0),
      output_layer_weight_decay(0),
      output_layer_bias_decay(0),
      direct_in_to_out_weight_decay(0),
      L1_penalty(false),
      direct_in_to_out(false),
      batch_size(1),
      c_penalization(0),
      maxY(1), // if Y is normalized to be in the interval [0,1], that is OK
      thresholdY(0.1),
      log_likelihood_vs_squared_error_balance(1),
      separate_mass_point(1),
      n_output_density_terms(0),
      generate_precision(1e-3),
      steps_type("sloped_steps"),
      centers_initialization("data"),
      curve_positions("uniform"),
      scale(5.0),
      unconditional_p0(0.01),
      mu_is_fixed(true),
      initial_hardness(1)
{}

PLEARN_IMPLEMENT_OBJECT(ConditionalDensityNet, "Neural network that implements the conditional density of a positive random variable",
                        "The input vector is used to compute parameters of an output density or output\n"
                        "cumulative distribution as well as output expected value. The ASSUMPTIONS\n"
                        "on the generating distribution P(Y|X) are the following:\n"
                        "  * Y is a single real value\n"
                        "  * 0 <= Y <= maxY, with maxY a known finite value\n"
                        "  * the density has a mass point at Y=0\n"
                        "  * the density is continuous for Y>0\n"
                        "The form of the conditional cumulative of Y is the following (separate_mass_point=false):\n"
                        "   P(Y<=y|theta) = (1/Z) (s(a) + sum_i u_i s(b_i) g(y,theta,i))\n"
                        "or for separate_mass_point=true:\n"
                        "   P(Y<=y|theta) = sigmoid(a) + (1-sigmoid(a)) (sum_i u_i s(b_i) (g(y,theta,i)-g(0,theta,i)))/Z\n"
                        "where s(z)=log(1+exp(z)) is the softplus function, and g is a monotonic function\n"
                        "in y whose first derivative and indefinite integral are known analytically.\n"
                        "The u_i are fixed from the unconditional distribution, such that s(b_i)=1 gives\n"
                        "approximately the right unconditional cumulative function (for infinite hardness):\n"
                        "   u_i = P(mu_{i-1}<Y<=mu_i) [unconditional].\n"
                        "The parameters theta of Y's distribution are (a,b_1,b_2,...,c_1,c_2,...,mu_1,mu_2,...),\n"
                        "which are obtained as the unconstrained outputs (no output transfer function) of a neural network.\n"
                        "The normalization constant Z is easily computed analytically: (separate_mass_point=false)\n"
                        "   Z = s(a) + sum_i u_i s(b_i) g(maxY,theta,i)\n"
                        "or for separate_mass_point=true:\n"
                        "   Z = sum_i u_i s(b_i) (g(maxY,theta,i)-g(0,theta,i))\n"
                        "The current implementation considers two choices for g:\n"
                        "  - sigmoid_steps: g(y,theta,i) = sigmoid(h*s(c_i)*(y-mu_i)/(mu_{i+1}-mu_i))\n"
                        "  - sloped_steps:  g(y,theta,i) = 1 + (s(s(c_i)*(mu_i-y)) - s(s(c_i)*(mu_{i+1}-y)))/(s(c_i)*(mu_{i+1}-mu_i))\n"
                        "where h is the 'initial_hardness' option.\n"
                        "The density is analytically obtained using the derivative g' of g and the\n"
                        "expected value is analytically obtained using the primitive G of g.\n"
                        "For the mass point at the origin,\n"
                        "   P(Y=0|theta) = P(Y<=0|theta)\n"
                        "(which is simply sigmoid(a) if separate_mass_point).\n"
                        "For positive values of Y: (separate_mass_point=false)\n"
                        "   p(y|theta) = (1/Z) sum_i u_i s(b_i) g'(y,theta,i)\n"
                        "or for separate_mass_point=true:\n"
                        "   p(y|theta) = (1-sigmoid(a)) (1/Z) sum_i u_i s(b_i) g'(y,theta,i).\n"
                        "And the expected value of Y is obtained using the primitive: (separate_mass_point=false)\n"
                        "   E[Y|theta] = (1/Z)*(s(a)*maxY + sum_i u_i s(b_i)(G(maxY,theta,i)-G(0,theta,i)))\n"
                        "or for separate_mass_point=true:\n"
                        "   E[Y|theta] = maxY - ((sigmoid(a)-(1-sigmoid(a))*(1/Z)*sum_i u_i s(b_i) g(0,theta,i))*maxY + (1-sigmoid(a))*(1/Z)*sum_i u_i s(b_i)(G(maxY,theta,i)-G(0,theta,i)))\n"
                        "Training the model can be done by maximum likelihood (minimizing the negative log of the\n"
                        "density) or by minimizing the average squared error (y-E[Y|theta])^2,\n"
                        "or a combination of the two (with the log_likelihood_vs_squared_error_balance option).\n"
                        "The step 'centers' mu_i are initialized according to some rule, in the interval [0,maxY]:\n"
                        "  - uniform: at regular intervals in [0,maxY]\n"
                        "  - log-scale: as the exponential of values at regular intervals in [0,log(1+maxY)], minus 1.\n"
                        "The c_i and b_i are initialized to inverse_softplus(1), and a using the empirical unconditional P(Y=0).\n"
                        "For the output curve options (outputs_def='L','D','C', or 'S'), the lower_bound and upper_bound\n"
                        "options of PDistribution are automatically set to 0 and maxY respectively.\n"
);
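// Illustration (added; not part of the PLearn source): a minimal standalone
// sketch of the two step functions g described in the help text above, with
// the hardness h*s(c_i)/(mu_{i+1}-mu_i) folded into a single parameter w
// (an assumption made only to keep the snippet short). Both variants rise
// monotonically from ~0 well below mu_i to ~1 well above mu_{i+1}; the
// density uses their derivative g' and the expectation uses the primitive G.
/*
#include <cmath>
#include <cstdio>

double softplus(double z) { return log1p(exp(z)); }   // s(z) = log(1+exp(z))
double sigmoid(double z)  { return 1.0 / (1.0 + exp(-z)); }

// sigmoid_steps: g(y) = sigmoid(w*(y-mu_i))
double g_sigmoid(double y, double w, double mu_i)
{ return sigmoid(w * (y - mu_i)); }

// sloped_steps: g(y) = 1 + (s(w*(mu_i-y)) - s(w*(mu_ip1-y)))/(w*(mu_ip1-mu_i))
double g_sloped(double y, double w, double mu_i, double mu_ip1)
{
    return 1.0 + (softplus(w * (mu_i - y)) - softplus(w * (mu_ip1 - y)))
                 / (w * (mu_ip1 - mu_i));
}

int main()
{
    // hypothetical values: hardness w=10, centers at 0.25 and 0.5
    for (double y = 0; y <= 1.0; y += 0.25)
        printf("y=%.2f  sigmoid_steps=%.4f  sloped_steps=%.4f\n",
               y, g_sigmoid(y, 10, 0.5), g_sloped(y, 10, 0.25, 0.5));
    return 0;
}
*/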
void ConditionalDensityNet::declareOptions(OptionList& ol)
{
    declareOption(ol, "nhidden", &ConditionalDensityNet::nhidden, OptionBase::buildoption,
                  "    number of hidden units in first hidden layer (0 means no hidden layer)\n");

    declareOption(ol, "nhidden2", &ConditionalDensityNet::nhidden2, OptionBase::buildoption,
                  "    number of hidden units in second hidden layer (0 means no hidden layer)\n");

    declareOption(ol, "weight_decay", &ConditionalDensityNet::weight_decay, OptionBase::buildoption,
                  "    global weight decay for all layers\n");

    declareOption(ol, "bias_decay", &ConditionalDensityNet::bias_decay, OptionBase::buildoption,
                  "    global bias decay for all layers\n");

    declareOption(ol, "layer1_weight_decay", &ConditionalDensityNet::layer1_weight_decay, OptionBase::buildoption,
                  "    Additional weight decay for the first hidden layer. Is added to weight_decay.\n");
    declareOption(ol, "layer1_bias_decay", &ConditionalDensityNet::layer1_bias_decay, OptionBase::buildoption,
                  "    Additional bias decay for the first hidden layer. Is added to bias_decay.\n");

    declareOption(ol, "layer2_weight_decay", &ConditionalDensityNet::layer2_weight_decay, OptionBase::buildoption,
                  "    Additional weight decay for the second hidden layer. Is added to weight_decay.\n");

    declareOption(ol, "layer2_bias_decay", &ConditionalDensityNet::layer2_bias_decay, OptionBase::buildoption,
                  "    Additional bias decay for the second hidden layer. Is added to bias_decay.\n");

    declareOption(ol, "output_layer_weight_decay", &ConditionalDensityNet::output_layer_weight_decay, OptionBase::buildoption,
                  "    Additional weight decay for the output layer. Is added to 'weight_decay'.\n");

    declareOption(ol, "output_layer_bias_decay", &ConditionalDensityNet::output_layer_bias_decay, OptionBase::buildoption,
                  "    Additional bias decay for the output layer. Is added to 'bias_decay'.\n");

    declareOption(ol, "direct_in_to_out_weight_decay", &ConditionalDensityNet::direct_in_to_out_weight_decay, OptionBase::buildoption,
                  "    Additional weight decay for the direct in-to-out layer. Is added to 'weight_decay'.\n");

    declareOption(ol, "L1_penalty", &ConditionalDensityNet::L1_penalty, OptionBase::buildoption,
                  "    should we use L1 penalty instead of the default L2 penalty on the weights? (default=0)\n");

    declareOption(ol, "direct_in_to_out", &ConditionalDensityNet::direct_in_to_out, OptionBase::buildoption,
                  "    should we include direct input to output connections? (default=0)\n");

    declareOption(ol, "optimizer", &ConditionalDensityNet::optimizer, OptionBase::buildoption,
                  "    specify the optimizer to use\n");

    declareOption(ol, "batch_size", &ConditionalDensityNet::batch_size, OptionBase::buildoption,
                  "    how many samples to use to estimate the average gradient before updating the weights\n"
                  "    0 is equivalent to specifying training_set->length(); default=1 (stochastic gradient)\n");

    declareOption(ol, "maxY", &ConditionalDensityNet::maxY, OptionBase::buildoption,
                  "    maximum allowed value for Y. Default = 1.0 (data normalized in [0,1])\n");

    declareOption(ol, "thresholdY", &ConditionalDensityNet::thresholdY, OptionBase::buildoption,
                  "    threshold value of Y for which we might want to compute P(Y>thresholdY), with outputs_def='t'\n");

    declareOption(ol, "log_likelihood_vs_squared_error_balance", &ConditionalDensityNet::log_likelihood_vs_squared_error_balance,
                  OptionBase::buildoption,
                  "    Relative weight given to negative log-likelihood (1 - this weight is given to squared error). Default=1\n");

    declareOption(ol, "n_output_density_terms", &ConditionalDensityNet::n_output_density_terms,
                  OptionBase::buildoption,
                  "    Number of terms (steps) in the output density function.\n");

    declareOption(ol, "steps_type", &ConditionalDensityNet::steps_type,
                  OptionBase::buildoption,
                  "    The type of steps used to build the cumulative distribution.\n"
                  "    Allowed values are:\n"
                  "      - sigmoid_steps: g(y,theta,i) = sigmoid(s(c_i)*(y-mu_i))\n"
                  "      - sloped_steps:  g(y,theta,i) = s(s(c_i)*(mu_i-y))-s(s(c_i)*(mu_{i+1}-y))\n"
                  "    Default=sloped_steps\n");
    declareOption(ol, "centers_initialization", &ConditionalDensityNet::centers_initialization,
                  OptionBase::buildoption,
                  "    How to initialize the step centers (mu_i). Allowed values are:\n"
                  "      - data: from the data at regular quantiles, with last one at maxY (default)\n"
                  "      - uniform: at regular intervals in [0,maxY]\n"
                  "      - log-scale: as the exponential of values at regular intervals in log-scale, using formula:\n"
                  "          i-th position = (exp(scale*(i+1-n_output_density_terms)/n_output_density_terms)-exp(-scale))/(1-exp(-scale))\n");
    declareOption(ol, "curve_positions", &ConditionalDensityNet::curve_positions,
                  OptionBase::buildoption,
                  "    How to choose the y-values for the probability curve (upper case outputs_def):\n"
                  "      - uniform: at regular intervals in [0,maxY]\n"
                  "      - log-scale: as the exponential of values at regular intervals in log-scale, using formula:\n"
                  "          i-th position = (exp(scale*(i+1-n_output_density_terms)/n_output_density_terms)-exp(-scale))/(1-exp(-scale))\n");
    declareOption(ol, "scale", &ConditionalDensityNet::scale,
                  OptionBase::buildoption,
                  "    scale used in the log-scale formula for centers_initialization and curve_positions");

    declareOption(ol, "unconditional_p0", &ConditionalDensityNet::unconditional_p0, OptionBase::buildoption,
                  "    approximate unconditional probability of Y=0 (mass point), used\n"
                  "    to initialize the parameters.\n");

    declareOption(ol, "mu_is_fixed", &ConditionalDensityNet::mu_is_fixed, OptionBase::buildoption,
                  "    whether to keep the step centers (mu[i]) fixed or to learn them.\n");

    declareOption(ol, "separate_mass_point", &ConditionalDensityNet::separate_mass_point, OptionBase::buildoption,
                  "    whether to model separately the mass point at the origin.\n");

    declareOption(ol, "initial_hardness", &ConditionalDensityNet::initial_hardness, OptionBase::buildoption,
                  "    value that scales softplus(c).\n");

    declareOption(ol, "c_penalization", &ConditionalDensityNet::c_penalization, OptionBase::buildoption,
                  "    the penalization coefficient for the 'c' output of the neural network");

    declareOption(ol, "generate_precision", &ConditionalDensityNet::generate_precision, OptionBase::buildoption,
                  "    precision when generating a new sample\n");

    declareOption(ol, "paramsvalues", &ConditionalDensityNet::paramsvalues, OptionBase::learntoption,
                  "    The learned neural network parameter vector\n");

    declareOption(ol, "unconditional_cdf", &ConditionalDensityNet::unconditional_cdf, OptionBase::learntoption,
                  "    Unconditional cumulative distribution function.\n");

    declareOption(ol, "unconditional_delta_cdf", &ConditionalDensityNet::unconditional_delta_cdf, OptionBase::learntoption,
                  "    Variations of the cdf from one step center to the next (this is u_i in the above equations).\n");

    declareOption(ol, "mu", &ConditionalDensityNet::mu, OptionBase::learntoption,
                  "    Step centers.\n");

    declareOption(ol, "y_values", &ConditionalDensityNet::y_values, OptionBase::learntoption,
                  "    Values of Y at which the cumulative (or density or survival) curves are computed if required.\n");

    inherited::declareOptions(ol);
}
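// Illustration (added; not part of the PLearn source): the log-scale formula
// quoted by both 'centers_initialization' and 'curve_positions'. For
// i = 0..n-1 it yields increasing positions in (0,1], with the last one
// exactly 1 (i.e. maxY after scaling). The values of n and scale below are
// hypothetical (scale's default is 5.0).
/*
#include <cmath>
#include <cstdio>

int main()
{
    const int n = 5;          // stand-in for n_output_density_terms
    const double scale = 5.0; // the 'scale' option
    double denom = 1.0 / (1.0 - exp(-scale));
    for (int i = 0; i < n; i++)
        printf("position[%d] = %g\n", i,
               (exp(scale * (i + 1 - n) / n) - exp(-scale)) * denom);
    return 0;
}
*/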
/*
int ConditionalDensityNet::outputsize() const
{
    int target_size = targetsize_<0?(train_set?train_set->targetsize():1):targetsize_;
    int l=0;
    for (unsigned int i=0;i<outputs_def.length();i++)
        if (outputs_def[i]=='L' || outputs_def[i]=='D' || outputs_def[i]=='C' || outputs_def[i]=='S')
            l+=n_curve_points;
        else if (outputs_def[i]=='e')
            l+=target_size;
        else if (outputs_def[i]=='v') // by default assume variance is full nxn matrix
            l+=target_size*target_size;
        else l++;
    return l;
}
*/

////////////
// build_ //
////////////
void ConditionalDensityNet::build_()
{
    if(inputsize_>=0 && targetsize_>=0 && weightsize_>=0)
    {
        lower_bound = 0;
        upper_bound = maxY;
        int n_output_parameters = mu_is_fixed?(1+n_output_density_terms*2):(1+n_output_density_terms*3);

        if (n_curve_points<0)
            n_curve_points = n_output_density_terms+1;

        // init. basic vars
        input = Var(n_input, "input");
        output = input;
        params.resize(0);

        // first hidden layer
        if(nhidden>0)
        {
            w1 = Var(1+n_input, nhidden, "w1");
            output = tanh(affine_transform(output,w1));
            params.append(w1);
        }

        // second hidden layer
        if(nhidden2>0)
        {
            w2 = Var(1+nhidden, nhidden2, "w2");
            output = tanh(affine_transform(output,w2));
            params.append(w2);
        }

        if (nhidden2>0 && nhidden==0)
            PLERROR("ConditionalDensityNet:: can't have nhidden2 (=%d) > 0 while nhidden=0",nhidden2);

        if (nhidden==-1)
            // special code meaning that the inputs should be ignored, only use biases
        {
            wout = Var(1, n_output_parameters, "wout");
            output = transpose(wout);
        }
        // output layer before transfer function
        else
        {
            wout = Var(1+output->size(), n_output_parameters, "wout");
            output = affine_transform(output,wout);
        }
        params.append(wout);

        // direct in-to-out layer
        if(direct_in_to_out)
        {
            wdirect = Var(n_input, n_output_parameters, "wdirect");
            //wdirect = Var(1+inputsize(), n_output_parameters, "wdirect");
            output += transposeProduct(wdirect, input); // affine_transform(input,wdirect);
            params.append(wdirect);
        }

        /*
         * target and weights
         */

        target = Var(n_target, "target");

        if(weightsize_>0)
        {
            if (weightsize_!=1)
                PLERROR("ConditionalDensityNet: expected weightsize to be 1 or 0 (or unspecified = -1, meaning 0), got %d",weightsize_);
            sampleweight = Var(1, "weight");
        }
        // output = parameters of the Y distribution

        int i=0;
        a = output[i++]; a->setName("a");
        //b = new SubMatVariable(output,0,i,1,n_output_density_terms);
        b = new SubMatVariable(output,i,0,n_output_density_terms,1);
        b->setName("b");
        i+=n_output_density_terms;
        //c = new SubMatVariable(output,0,i,1,n_output_density_terms);
        c = new SubMatVariable(output,i,0,n_output_density_terms,1);
        c->setName("c");

        // we don't want to clear mu if this build is called
        // just after a load(), because mu is a learnt option
        if (!mu || (mu->length()!=n_output_density_terms && train_set))
        {
            if (mu_is_fixed)
                //mu = Var(1,n_output_density_terms);
                mu = Var(n_output_density_terms,1);
            else
            {
                i+=n_output_density_terms;
                //mu = new SubMatVariable(output,0,i,1,n_output_density_terms);
                mu = new SubMatVariable(output,i,0,n_output_density_terms,1);
            }
        }
        mu->setName("mu");
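        // Note (added): the network's raw output row is thus sliced as
        //   [ a | b_1..b_n | c_1..c_n | mu_1..mu_n (only if !mu_is_fixed) ]
        // with n = n_output_density_terms, matching n_output_parameters
        // (= 1+2n or 1+3n) computed at the top of build_().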
        /*
         * output density
         */
        Var nll; // negative log likelihood
        Var max_y = var(maxY);
        Var left_side = vconcat(var(0.0) & (new SubMatVariable(mu,0,0,n_output_density_terms-1,1)));
        centers = target-mu;
        centers_M = max_y-mu;
        unconditional_cdf.resize(n_output_density_terms);
        if (unconditional_delta_cdf)
        {
            // don't clear it if this build is called just after a load
            if (unconditional_delta_cdf.length()!=n_output_density_terms)
                unconditional_delta_cdf->resize(n_output_density_terms,1);
        }
        else
            unconditional_delta_cdf = Var(n_output_density_terms,1);
        initial_hardnesses = var(initial_hardness) / (mu - left_side);
        pos_b = softplus(b)*unconditional_delta_cdf;
        pos_c = softplus(c)*initial_hardnesses;
        Var scaled_centers = pos_c*centers;
        // scaled centers evaluated at target = M
        Var scaled_centers_M = pos_c*centers_M;
        // scaled centers evaluated at target = 0
        Var scaled_centers_0 = -pos_c*mu;
        Var lhopital, inverse_denominator, density_numerator;
        if (separate_mass_point)
        {
            pos_a = sigmoid(a);
            if (steps_type=="sigmoid_steps")
            {
                steps = sigmoid(scaled_centers);
                // steps evaluated at target = M
                steps_M = sigmoid(scaled_centers_M);
                steps_0 = sigmoid(scaled_centers_0);
                // derivative of steps wrt target
                steps_gradient = pos_c*steps*(1-steps);
                steps_integral = (softplus(scaled_centers_M) - softplus(scaled_centers_0))/pos_c;
                delta_steps = centers_M*steps_M + mu*sigmoid(scaled_centers_0);
            }
            else if (steps_type=="sloped_steps")
            {
                steps = soft_slope(target, pos_c, left_side, mu);
                steps_M = soft_slope(max_y, pos_c, left_side, mu);
                steps_0 = soft_slope(var(0.0), pos_c, left_side, mu);
                steps_gradient = d_soft_slope(target, pos_c, left_side, mu);
                steps_integral = soft_slope_integral(pos_c,left_side,mu,0.0,maxY);
                delta_steps = soft_slope_limit(target, pos_c, left_side, mu);
            }
            else PLERROR("ConditionalDensityNet::build, steps_type option value unknown: %s",steps_type.c_str());

            density_numerator = dot(pos_b,steps_gradient);
            cum_denominator = dot(pos_b,positive(steps_M-steps_0));
            inverse_denominator = 1.0/cum_denominator;
            cum_numerator = dot(pos_b,(steps-steps_0));
            cumulative = pos_a + (1-pos_a) * cum_numerator * inverse_denominator;
            density = density_numerator * inverse_denominator; // this is the conditional density for Y>0
            // apply l'hopital rule if pos_c --> 0 to avoid blow-up
            // (N.B. lim_{pos_c->0} pos_b/pos_c*steps_integral = pos_b*delta_steps)
            lhopital = ifThenElse(isAboveThreshold(pos_c,1e-20),steps_integral,delta_steps);
            expected_value = max_y - ((pos_a-(1-pos_a)*inverse_denominator*dot(pos_b,steps_0))*max_y +
                                      (1-pos_a)*dot(pos_b,lhopital)*inverse_denominator);
            mass_cost = -log(ifThenElse(isAboveThreshold(target,0.0,1,0,true),(1-pos_a),pos_a));
            pos_y_cost = ifThenElse(isAboveThreshold(target,0.0,1,0,true),-log(density),var(0.0));
            nll = -log(ifThenElse(isAboveThreshold(target,0.0,1,0,true),density*(1-pos_a),pos_a));
        }
        else
        {
            pos_a = var(0.0);
            if (steps_type=="sigmoid_steps")
            {
                steps = sigmoid(scaled_centers);
                // steps evaluated at target = M
                steps_M = sigmoid(scaled_centers_M);
                steps_0 = sigmoid(scaled_centers_0);
                // derivative of steps wrt target
                steps_gradient = pos_c*steps*(1-steps);
                steps_integral = (softplus(scaled_centers_M) - softplus(scaled_centers_0))/pos_c;
                delta_steps = centers_M*steps_M + mu*sigmoid(scaled_centers_0);
            }
            else if (steps_type=="sloped_steps")
            {
                steps = soft_slope(target, pos_c, left_side, mu);
                steps_M = soft_slope(max_y, pos_c, left_side, mu);
                steps_0 = soft_slope(var(0.0), pos_c, left_side, mu);
                steps_gradient = d_soft_slope(target, pos_c, left_side, mu);
                steps_integral = soft_slope_integral(pos_c,left_side,mu,0.0,maxY);
                delta_steps = soft_slope_limit(target, pos_c, left_side, mu);
            }
            else PLERROR("ConditionalDensityNet::build, steps_type option value unknown: %s",steps_type.c_str());

            density_numerator = dot(pos_b,steps_gradient);
            cum_denominator = dot(pos_b,steps_M - steps_0);
            inverse_denominator = 1.0/cum_denominator;
            cum_numerator = dot(pos_b,steps - steps_0);
            cumulative = cum_numerator * inverse_denominator;
            density = density_numerator * inverse_denominator;
            // apply l'hopital rule if pos_c --> 0 to avoid blow-up
            // (N.B. lim_{pos_c->0} pos_b/pos_c*steps_integral = pos_b*delta_steps)
            lhopital = ifThenElse(isAboveThreshold(pos_c,1e-20),steps_integral,delta_steps);
            expected_value = dot(pos_b,lhopital)*inverse_denominator;
            nll = -log(ifThenElse(isAboveThreshold(target,0.0,1,0,true),density,cumulative));
        }
        max_y->setName("maxY");
        left_side->setName("left_side");
        pos_a->setName("pos_a");
        pos_b->setName("pos_b");
        pos_c->setName("pos_c");
        steps->setName("steps");
        steps_M->setName("steps_M");
        steps_integral->setName("steps_integral");
        expected_value->setName("expected_value");
        density_numerator->setName("density_numerator");
        cum_denominator->setName("cum_denominator");
        inverse_denominator->setName("inverse_denominator");
        cum_numerator->setName("cum_numerator");
        cumulative->setName("cumulative");
        density->setName("density");
        lhopital->setName("lhopital");

        /*
         * cost functions:
         * training_criterion = log_likelihood_vs_squared_error_balance*neg_log_lik
         *                      +(1-log_likelihood_vs_squared_error_balance)*squared_err
         *                      +penalties
         * neg_log_lik = -log(1_{target=0} cumulative + 1_{target>0} density)
         * squared_err = square(target - expected_value)
         */
        costs.resize(3);

        costs[1] = nll;
        costs[2] = square(target-expected_value);
        // for debugging gradient computation error
        if (log_likelihood_vs_squared_error_balance==1)
            costs[0] = costs[1];
        else if (log_likelihood_vs_squared_error_balance==0)
            costs[0] = costs[2];
        else costs[0] = log_likelihood_vs_squared_error_balance*costs[1]+
                 (1-log_likelihood_vs_squared_error_balance)*costs[2];
        if (c_penalization > 0) {
            costs[0] = costs[0] + c_penalization * sumsquare(c);
        }

        // for debugging
        //costs[0] = mass_cost + pos_y_cost;
        //costs[1] = mass_cost;
        //costs[2] = pos_y_cost;

        /*
         * weight and bias decay penalty
         */

        // create penalties
        penalties.resize(0); // prevents penalties from being added twice by consecutive builds
        if(w1 && ((layer1_weight_decay + weight_decay)!=0 || (layer1_bias_decay + bias_decay)!=0))
            penalties.append(affine_transform_weight_penalty(w1, (layer1_weight_decay + weight_decay), (layer1_bias_decay + bias_decay), L1_penalty));
        if(w2 && ((layer2_weight_decay + weight_decay)!=0 || (layer2_bias_decay + bias_decay)!=0))
            penalties.append(affine_transform_weight_penalty(w2, (layer2_weight_decay + weight_decay), (layer2_bias_decay + bias_decay), L1_penalty));
        if(wout && ((output_layer_weight_decay + weight_decay)!=0 || (output_layer_bias_decay + bias_decay)!=0))
            penalties.append(affine_transform_weight_penalty(wout, (output_layer_weight_decay + weight_decay),
                                                             (output_layer_bias_decay + bias_decay), L1_penalty));
        if(wdirect && (direct_in_to_out_weight_decay + weight_decay) != 0)
        {
            if (L1_penalty)
                penalties.append(sumabs(wdirect)*(direct_in_to_out_weight_decay + weight_decay));
            else
                penalties.append(sumsquare(wdirect)*(direct_in_to_out_weight_decay + weight_decay));
        }

        test_costs = hconcat(costs);

        // apply penalty to cost
        if(penalties.size() != 0) {
            // only multiply by sampleweight if there are weights
            if (weightsize_>0)
                training_cost = hconcat(sampleweight*sum(hconcat(costs[0] & penalties))
                                        & (test_costs*sampleweight));
            else
            {
                training_cost = hconcat(sum(hconcat(costs[0] & penalties)) & test_costs);
            }
        }
        else {
            // only multiply by sampleweight if there are weights
            if(weightsize_>0) {
                training_cost = test_costs*sampleweight;
            } else {
                training_cost = test_costs;
            }
        }
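        // Note (added): the resulting training_cost row therefore starts with
        // the actual optimization objective,
        //   balance*NLL + (1-balance)*MSE [+ c_penalization*sum(c^2)] + decay penalties
        // (balance = log_likelihood_vs_squared_error_balance), followed by the
        // three test costs (training_criterion, NLL, mse); when weightsize_>0
        // everything is multiplied by the sample weight. This matches the four
        // names returned by getTrainCostNames() below.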
        training_cost->setName("training_cost");
        test_costs->setName("test_costs");
        output->setName("output");

        // Shared values hack...
        bool use_paramsvalues=(bool)paramsvalues && (paramsvalues.size() == params.nelems());
        if(use_paramsvalues)
        {
            params << paramsvalues;
            initialize_mu(mu->value);
        }
        else
        {
            paramsvalues.resize(params.nelems());
            initializeParams();
        }
        params.makeSharedValue(paramsvalues);

        VarArray output_and_target = output & target;
        output_and_target_values.resize(output.length()+target.length());
        output_and_target.makeSharedValue(output_and_target_values);

        cdf_f = Func(output_and_target,cumulative);
        mean_f = Func(output,expected_value);
        density_f = Func(output_and_target,density);

        // Funcs
        VarArray outvars;
        VarArray testinvars;
        invars.resize(0);
        if(input)
        {
            invars.push_back(input);
            testinvars.push_back(input);
        }
        if(expected_value)
        {
            outvars.push_back(expected_value);
        }
        if(target)
        {
            invars.push_back(target);
            testinvars.push_back(target);
            outvars.push_back(target);
        }
        if(sampleweight)
        {
            invars.push_back(sampleweight);
        }

        VarArray outputs_array;

        for (unsigned int i=0;i<outputs_def.length();i++)
        {
            if (outputs_def[i]=='e')
                outputs_array &= expected_value;
            else if (outputs_def[i]=='t')
            {
                Func survival_f(target&output,var(1.0)-cumulative);
                Var threshold_y(1,1);
                threshold_y->valuedata[0]=thresholdY;
                outputs_array &= survival_f(threshold_y & output);
            }
            else if (outputs_def[i]=='S' || outputs_def[i]=='C' ||
                     outputs_def[i]=='L' || outputs_def[i]=='D')
            {
                Func prob_f(target&output,outputs_def[i]=='S'?(var(1.0)-cumulative):
                            (outputs_def[i]=='C'?cumulative:
                             (outputs_def[i]=='D'?density:log(density))));
                y_values.resize(n_curve_points);
                if (curve_positions=="uniform")
                {
                    real delta = maxY/(n_curve_points-1);
                    for (int i=0;i<n_curve_points;i++)
                    {
                        y_values[i] = var(i*delta);
                        y_values[i]->setName("y"+tostring(i));
                        outputs_array &= prob_f(y_values[i] & output);
                    }
                } else // log-scale
                {
                    real denom = 1.0/(1-exp(-scale));
                    for (int i=0;i<n_curve_points;i++)
                    {
                        y_values[i] = var((exp(scale*(i-n_output_density_terms)/n_output_density_terms)-exp(-scale))*denom);
                        y_values[i]->setName("y"+tostring(i));
                        outputs_array &= prob_f(y_values[i] & output);
                    }
                }
            } else
                outputs_array &= expected_value;
            //  PLERROR("ConditionalDensityNet::build: can't handle outputs_def with option value = %c",outputs_def[i]);
        }
        outputs = hconcat(outputs_array);
        if (mu_is_fixed)
            f = Func(input, params&mu, outputs);
        else
            f = Func(input, params, outputs);
        f->recomputeParents();

        in2distr_f = Func(input,pos_a);
        in2distr_f->recomputeParents();

        if (mu_is_fixed)
            test_costf = Func(testinvars, params&mu, outputs&test_costs);
        else
            test_costf = Func(testinvars, params, outputs&test_costs);

        if (use_paramsvalues)
            test_costf->recomputeParents();
    }
    PDistribution::finishConditionalBuild();
}


/* TODO Remove (?)
void ConditionalDensityNet::computeOutput(const Vec& inputv, Vec& outputv) const
{
    f->fprop(inputv,outputv);
}
*/

/* TODO Remove (?)
void ConditionalDensityNet::computeOutputAndCosts(const Vec& inputv, const Vec& targetv,
                                                  Vec& outputv, Vec& costsv) const
{
    test_costf->fprop(inputv&targetv, outputv&costsv);
}
*/

TVec<string> ConditionalDensityNet::getTrainCostNames() const
{
    if (penalties.size() > 0)
    {
        TVec<string> cost_funcs(4);
        cost_funcs[0]="training_criterion+penalty";
        cost_funcs[1]="training_criterion";
        cost_funcs[2]="NLL";
        cost_funcs[3]="mse";
        return cost_funcs;
    }
    else return getTestCostNames();
}

/*
TVec<string> ConditionalDensityNet::getTestCostNames() const
{
    TVec<string> cost_funcs(3);
    cost_funcs[0]="training_criterion";
    cost_funcs[1]="NLL";
    cost_funcs[2]="mse";
    return cost_funcs;
}
*/

// ### Nothing to add here, simply calls build_
void ConditionalDensityNet::build()
{
    inherited::build();
    build_();
}

extern void varDeepCopyField(Var& field, CopiesMap& copies);

void ConditionalDensityNet::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies)
{
    inherited::makeDeepCopyFromShallowCopy(copies);
    varDeepCopyField(input, copies);
    varDeepCopyField(target, copies);
    varDeepCopyField(sampleweight, copies);
    varDeepCopyField(w1, copies);
    varDeepCopyField(w2, copies);
    varDeepCopyField(wout, copies);
    varDeepCopyField(wdirect, copies);
    varDeepCopyField(output, copies);
    varDeepCopyField(outputs, copies);
    varDeepCopyField(a, copies);
    varDeepCopyField(pos_a, copies);
    varDeepCopyField(b, copies);
    varDeepCopyField(pos_b, copies);
    varDeepCopyField(c, copies);
    varDeepCopyField(pos_c, copies);
    varDeepCopyField(density, copies);
    varDeepCopyField(cumulative, copies);
    varDeepCopyField(expected_value, copies);
    deepCopyField(costs, copies);
    deepCopyField(penalties, copies);
    varDeepCopyField(training_cost, copies);
    varDeepCopyField(test_costs, copies);
    deepCopyField(invars, copies);
    deepCopyField(params, copies);
    deepCopyField(paramsvalues, copies);
    varDeepCopyField(centers, copies);
    varDeepCopyField(centers_M, copies);
    varDeepCopyField(steps, copies);
    varDeepCopyField(steps_M, copies);
    varDeepCopyField(steps_0, copies);
    varDeepCopyField(steps_gradient, copies);
    varDeepCopyField(steps_integral, copies);
    varDeepCopyField(delta_steps, copies);
    varDeepCopyField(cum_numerator, copies);
    varDeepCopyField(cum_denominator, copies);
    deepCopyField(unconditional_cdf, copies);
    varDeepCopyField(unconditional_delta_cdf, copies);
    varDeepCopyField(initial_hardnesses, copies);
    varDeepCopyField(prev_centers, copies);
    varDeepCopyField(prev_centers_M, copies);
    varDeepCopyField(scaled_prev_centers, copies);
    varDeepCopyField(scaled_prev_centers_M, copies);
    varDeepCopyField(minus_prev_centers_0, copies);
    varDeepCopyField(minus_scaled_prev_centers_0, copies);
    deepCopyField(y_values, copies);
    varDeepCopyField(mu, copies);
    deepCopyField(f, copies);
    deepCopyField(test_costf, copies);
    deepCopyField(output_and_target_to_cost, copies);
    deepCopyField(cdf_f, copies);
    deepCopyField(mean_f, copies);
    deepCopyField(density_f, copies);
    deepCopyField(in2distr_f, copies);
    deepCopyField(output_and_target, copies);
    deepCopyField(output_and_target_values, copies);
    varDeepCopyField(totalcost, copies);
    varDeepCopyField(mass_cost, copies);
    varDeepCopyField(pos_y_cost, copies);
    deepCopyField(optimizer, copies);
}

void ConditionalDensityNet::setInput(const Vec& in) const
{
#ifdef BOUNDCHECK
    if (!f)
        PLERROR("ConditionalDensityNet:setInput: build was not completed (maybe because training set was not provided)!");
#endif
    in2distr_f->fprop(in,pos_a->value);
}

real ConditionalDensityNet::log_density(const Vec& y) const
{
    static Vec d;
    d.resize(1);
    target->value << y;
    density_f->fprop(output_and_target_values, d);
    return log(d[0]);
}

real ConditionalDensityNet::survival_fn(const Vec& y) const
{
    return 1 - cdf(y);
}

// must be called after setInput
real ConditionalDensityNet::cdf(const Vec& y) const
{
    Vec cum(1);
    target->value << y;
    cdf_f->fprop(output_and_target_values,cum);
#ifdef BOUNDCHECK
    if (cum[0] < -1e-3)
        PLERROR("In ConditionalDensityNet::cdf - The cdf is < 0");
#endif
    return cum[0];
}

void ConditionalDensityNet::expectation(Vec& mu) const
{
    mu.resize(n_target);
    mean_f->fprop(output->value,mu);
}

void ConditionalDensityNet::variance(Mat& covar) const
{
    PLERROR("variance not implemented for ConditionalDensityNet");
}

void ConditionalDensityNet::resetGenerator(long g_seed) const
{
    manual_seed(g_seed);
}

void ConditionalDensityNet::generate(Vec& y) const
{
    real u = uniform_sample();
    y.resize(1);
    if (u<pos_a->value[0]) // mass point
    {
        y[0]=0;
        return;
    }
    // then find y s.t. P(Y<y|x) = u by binary search
    real y0=0;
    real y2=maxY;
    real delta;
    real p;
    do
    {
        delta = y2 - y0;
        y[0] = y0 + delta*0.5;
        p = cdf(y);
        if (p<u)
            // increase y
            y0 = y[0];
        else
            // decrease y
            y2 = y[0];
    }
    while (delta > generate_precision * maxY);
}
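// Illustration (added; not part of the PLearn source): the inversion scheme
// used by generate() above, shown standalone on a hypothetical monotone cdf
// F(y)=y^2 on [0,1]. Draw u, then bisect until the bracket [y0,y2] is
// narrower than precision*maxY; u is fixed here for reproducibility, where
// generate() uses uniform_sample().
/*
#include <cstdio>

int main()
{
    const double maxY = 1, precision = 1e-3; // stand-in for generate_precision
    const double u = 0.7;                    // the uniform draw
    double y0 = 0, y2 = maxY, y = 0, delta;
    do {
        delta = y2 - y0;
        y = y0 + 0.5 * delta;
        double p = y * y;                    // hypothetical cdf F(y) = y^2
        if (p < u) y0 = y;                   // cdf too small: move right
        else       y2 = y;                   // cdf too large: move left
    } while (delta > precision * maxY);
    printf("F^{-1}(%g) ~= %g (exact: %g)\n", u, y, 0.8366600265);
    return 0;
}
*/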
// Default version of inputsize returns learner->inputsize()
// If this is not appropriate, you should uncomment this and define
// it properly in the .cc
// int ConditionalDensityNet::inputsize() const {}

void ConditionalDensityNet::initializeParams()
{
    if (seed_>=0)
        manual_seed(seed_);
    else
        PLearn::seed();

    //real delta = 1./sqrt(inputsize());
    real delta = 1.0 / n_input;
    /*
    if(direct_in_to_out)
    {
        //fill_random_uniform(wdirect->value, -delta, +delta);
        fill_random_normal(wdirect->value, 0, delta);
        //wdirect->matValue(0).clear();
    }
    */
    if(nhidden>0)
    {
        //fill_random_uniform(w1->value, -delta, +delta);
        //delta = 1./sqrt(nhidden);
        fill_random_normal(w1->value, 0, delta);
        if(direct_in_to_out)
        {
            //fill_random_uniform(wdirect->value, -delta, +delta);
            fill_random_normal(wdirect->value, 0, 0.01*delta);
            wdirect->matValue(0).clear();
        }
        delta = 1.0/nhidden;
        w1->matValue(0).clear();
    }
    if(nhidden2>0)
    {
        //fill_random_uniform(w2->value, -delta, +delta);
        //delta = 1./sqrt(nhidden2);
        delta = 0.1/nhidden2;
        fill_random_normal(w2->value, 0, delta);
        w2->matValue(0).clear();
    }
    //fill_random_uniform(wout->value, -delta, +delta);
    fill_random_normal(wout->value, 0, delta);
    Mat a_weights = wout->matValue.column(0);
    // a_weights *= 3.0; // to get more dynamic range

    if (centers_initialization!="data")
    {
        Vec output_biases = wout->matValue(0);
        Vec mu_;
        int i=0;
        Vec a_ = output_biases.subVec(i++,1);
        Vec b_ = output_biases.subVec(i,n_output_density_terms); i+=n_output_density_terms;
        Vec c_ = output_biases.subVec(i,n_output_density_terms); i+=n_output_density_terms;
        if (mu_is_fixed)
            mu_ = mu->value;
        else
            mu_ = output_biases.subVec(i,n_output_density_terms); i+=n_output_density_terms;
        initialize_mu(mu_);
        b_.fill(inverse_softplus(1.0));
        c_.fill(inverse_softplus(1.0));
        if (separate_mass_point)
            a_[0] = unconditional_p0>0?inverse_sigmoid(unconditional_p0):-50;
        else a_[0] = -50;
        unconditional_delta_cdf->value.fill((1.0-unconditional_p0)/n_output_density_terms);
    }

    // Reset optimizer
    if(optimizer)
        optimizer->reset();
}
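// Illustration (added; not part of the PLearn source): the initializations
// above rely on the inverses of softplus and sigmoid, so that e.g.
// softplus(inverse_softplus(1)) = 1 (the value of s(b_i) for which the u_i
// reproduce the unconditional cdf, per the class help) and
// sigmoid(inverse_sigmoid(p0)) = p0. A quick standalone check:
/*
#include <cmath>
#include <cstdio>

double softplus(double z)         { return log1p(exp(z)); }
double inverse_softplus(double y) { return log(exp(y) - 1); }  // valid for y > 0
double sigmoid(double z)          { return 1 / (1 + exp(-z)); }
double inverse_sigmoid(double p)  { return log(p / (1 - p)); } // valid for 0 < p < 1

int main()
{
    printf("softplus(inverse_softplus(1))  = %g\n", softplus(inverse_softplus(1.0)));
    printf("sigmoid(inverse_sigmoid(0.01)) = %g\n", sigmoid(inverse_sigmoid(0.01)));
    return 0;
}
*/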
void ConditionalDensityNet::initialize_mu(Vec& mu_)
{
    if (centers_initialization=="uniform")
    {
        real delta=maxY/n_output_density_terms;
        real center=delta;
        for (int i=0;i<n_output_density_terms;i++,center+=delta)
            mu_[i]=center;
    } else if (centers_initialization=="log-scale")
    {
        real denom = 1.0/(1-exp(-scale));
        for (int i=0;i<n_output_density_terms;i++)
            mu_[i]=(exp(scale*(i+1-n_output_density_terms)/n_output_density_terms)-exp(-scale))*denom;
    } else if (centers_initialization!="data")
        PLERROR("ConditionalDensityNet::initialize_mu: unknown value %s for centers_initialization option",
                centers_initialization.c_str());
}

void ConditionalDensityNet::forget()
{
    if (train_set) initializeParams();
    stage = 0;
}

void ConditionalDensityNet::train()
{
    int i=0, j=0;
    if(!train_set)
        PLERROR("In ConditionalDensityNet::train, you did not setTrainingSet");

    if(!train_stats)
        PLERROR("In ConditionalDensityNet::train, you did not setTrainStatsCollector");

    if (!already_sorted || n_margin > 0)
        PLERROR("In ConditionalDensityNet::train - Currently, can only be trained if the data is given as input, target");

    if(f.isNull()) // Net has not been properly built yet (because build was called before the learner had a proper training set)
        build();

    int l = train_set->length();
    int nsamples = batch_size>0 ? batch_size : l;
    Func paramf = Func(invars, training_cost); // parameterized function to optimize
    Var totalcost = meanOf(train_set, paramf, nsamples);
    if(optimizer)
    {
        optimizer->setToOptimize(params, totalcost);
        optimizer->build();
    }

    // number of optimiser stages corresponding to one learner stage (one epoch)
    int optstage_per_lstage = l/nsamples;

    ProgressBar* pb = 0;
    if(report_progress)
        pb = new ProgressBar("Training ConditionalDensityNet from stage " + tostring(stage) + " to " + tostring(nstages), nstages-stage);

    // estimate the unconditional cdf
    static real weight;

    if (stage==0)
    {
        Vec mu_values = mu->value;
        unconditional_cdf.clear();
        real sum_w=0;
        unconditional_p0 = 0;
        static StatsCollector sc;
        bool init_mu_from_data=centers_initialization=="data";
        if (init_mu_from_data)
        {
            sc.maxnvalues = min(l,100*n_output_density_terms);
            sc.build();
            sc.forget();
        }
        Vec tmp1(inputsize());
        Vec tmp2(targetsize());
        for (i=0;i<l;i++)
        {
            train_set->getExample(i, tmp1, tmp2, weight);
            input->value << tmp1.subVec(0, n_input);
            target->value << tmp1.subVec(n_input, n_target);
            real y = target->valuedata[0];
            if (y < 0)
                PLERROR("In ConditionalDensityNet::train - Found a negative target");
            if (y > maxY)
                PLERROR("In ConditionalDensityNet::train - Found a target > maxY");
            if (y == 0)
                unconditional_p0 += weight;
            if (init_mu_from_data)
                sc.update(y,weight);
            else
                for (int j=0;j<n_output_density_terms;j++)
                    if (y<=mu_values[j])
                        unconditional_cdf[j] += weight;
            sum_w += weight;
        }
        static Mat cdf;
        unconditional_p0 *= 1.0/sum_w;
        if (init_mu_from_data)
        {
            cdf = sc.cdf();
            int k=3;
            real mean_y = sc.mean();

            real current_mean_fraction = 0;
            real prev_cdf = unconditional_p0;
            real prev_y = 0;
            for (int j=0;j<n_output_density_terms;j++)
            {
                real target_fraction = mean_y*(j+1.0)/n_output_density_terms;
                for (;k<cdf.length() && current_mean_fraction < target_fraction;k++)
                {
                    current_mean_fraction += (cdf(k,0)+prev_y)*0.5*(cdf(k,1)-prev_cdf);
                    prev_cdf = cdf(k,1);
                    prev_y = cdf(k,0);
                }
                if (j==n_output_density_terms-1)
                {
                    mu_values[j]=maxY;
                    unconditional_cdf[j]=1.0;
                }
                else
                {
                    mu_values[j]=cdf(k,0);
                    unconditional_cdf[j]=cdf(k,1);
                }
            }
        }
        else
            for (j=0;j<n_output_density_terms;j++)
                unconditional_cdf[j] *= 1.0/sum_w;
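        // Note (added): at this point unconditional_cdf[j] holds the empirical
        // P(Y<=mu_j). The two statements below convert it to the increments
        // u_j = P(mu_{j-1} < Y <= mu_j) (with the mass point P(Y=0) removed
        // from the first one), i.e. the unconditional_delta_cdf used as u_i
        // in the class help above.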
        unconditional_delta_cdf->valuedata[0]=unconditional_cdf[0]-unconditional_p0;
        for (i=1;i<n_output_density_terms;i++)
            unconditional_delta_cdf->valuedata[i]=unconditional_cdf[i]-unconditional_cdf[i-1];

        // initialize biases based on unconditional distribution
        Vec output_biases = wout->matValue(0);
        i=0;
        Vec a_ = output_biases.subVec(i++,1);
        Vec b_ = output_biases.subVec(i,n_output_density_terms); i+=n_output_density_terms;
        Vec c_ = output_biases.subVec(i,n_output_density_terms); i+=n_output_density_terms;
        Vec mu_;
        Vec s_c(n_output_density_terms);
        if (mu_is_fixed)
            mu_ = mu->value;
        else
            mu_ = output_biases.subVec(i,n_output_density_terms); i+=n_output_density_terms;
        b_.fill(inverse_softplus(1.0));
        initialize_mu(mu_);
        for (i=0;i<n_output_density_terms;i++)
        {
            real prev_mu = i==0?0:mu_[i-1];
            real delta = mu_[i]-prev_mu;
            s_c[i] = delta>0?initial_hardness/delta:-50;
            c_[i] = inverse_softplus(1.0);
        }

        if (centers_initialization!="data")
            unconditional_delta_cdf->value.fill(1.0/n_output_density_terms);
        real *dcdf = unconditional_delta_cdf->valuedata;
        if (separate_mass_point)
            a_[0] = unconditional_p0>0?inverse_sigmoid(unconditional_p0):-50;
        else if (dcdf[0]==0)
            a_[0]=unconditional_p0>0?inverse_softplus(unconditional_p0):-50;
        else
        {
            real s=0;
            if (steps_type=="sigmoid_steps")
                for (i=0;i<n_output_density_terms;i++)
                    s+=dcdf[i]*(unconditional_p0*sigmoid(s_c[i]*(maxY-mu_[i]))-sigmoid(-s_c[i]*mu_[i]));
            else
                for (i=0;i<n_output_density_terms;i++)
                {
                    real prev_mu = i==0?0:mu_[i-1];
                    real ss1 = soft_slope(maxY,s_c[i],prev_mu,mu_[i]);
                    real ss2 = soft_slope(0,s_c[i],prev_mu,mu_[i]);
                    s+=dcdf[i]*(unconditional_p0*ss1 - ss2);
                }
            real sa=s/(1-unconditional_p0);
            a_[0]=sa>0?inverse_softplus(sa):-50;

            /*
            Mat At(n_output_density_terms,n_output_density_terms); // transpose of the linear system matrix
            Mat rhs(1,n_output_density_terms); // right hand side of the linear system
            // solve the system to find b's that make the unconditional fit the observed data
            //    sum_j sb_j dcdf_j (cdf_j step_j(maxY) - step_j(mu_i)) = sa (1 - cdf_i)
            //
            for (int i=0;i<n_output_density_terms;i++)
            {
                real* Ati = At[i];
                real prev_mu = i==0?0:mu_[i-1];
                for (int j=0;j<n_output_density_terms;j++)
                {
                    if (steps_type=="sigmoid_steps")
                        Ati[j] = dcdf[i]*(unconditional_cdf[j]*sigmoid(initial_hardness*(maxY-mu_[i]))-
                                          sigmoid(initial_hardness*(mu_[j]-mu_[i])));
                    else
                        Ati[j] = dcdf[i]*(unconditional_cdf[j]*soft_slope(maxY,initial_hardness,prev_mu,mu_[i])-
                                          soft_slope(mu_[j],initial_hardness,prev_mu,mu_[i]));
                }
                rhs[0][i] = sa*(1-unconditional_cdf[i]);
            }
            TVec<int> pivots(n_output_density_terms);
            int status = lapackSolveLinearSystem(At,rhs,pivots);
            if (status==0)
                for (int i=0;i<n_output_density_terms;i++)
                    b_[i] = inverse_softplus(rhs[0][i]);
            else
                PLWARNING("ConditionalDensityNet::initializeParams() Could not invert matrix to obtain exact init. of b");
            */
        }
        test_costf->recomputeParents();

        // debugging
        static bool display_graph = false;
        if (display_graph) f->fprop(input->value,outputs->value);
        //displayVarGraph(outputs,true);
        if (display_graph)
            displayFunction(f,true);
        if (display_graph)
            displayFunction(test_costf,true);
    }
    int initial_stage = stage;
    bool early_stop=false;
    while(stage<nstages && !early_stop)
    {
        optimizer->nstages = optstage_per_lstage;
        train_stats->forget();
        optimizer->early_stop = false;
        early_stop = optimizer->optimizeN(*train_stats);

        //if (verify_gradient)
        //    training_cost->verifyGradient(verify_gradient);
        //if (stage==nstages-1 && verify_gradient)
        static real verify_gradient = 0;
        if (verify_gradient)
        {
            if (batch_size == 0)
            {
                cout << "OPTIMIZER" << endl;
                optimizer->verifyGradient(0.001);
            }
        }
        static bool display_graph = false;
        if (display_graph)
            displayFunction(f,true);
        if (display_graph)
            displayFunction(test_costf,true);

        train_stats->finalize();
        if(verbosity>2)
            cerr << "Epoch " << stage << " train objective: " << train_stats->getMean() << endl;
        ++stage;
        if(pb)
            pb->update(stage-initial_stage);
    }
    if(verbosity>1)
        cerr << "EPOCH " << stage << " train objective: " << train_stats->getMean() << endl;

    if(pb)
        delete pb;

    test_costf->recomputeParents();
}

} // end of namespace PLearn
