#include <plearn/var/AffineTransformVariable.h>
#include <plearn/var/AffineTransformWeightPenalty.h>
#include "ConditionalDensityNet.h"
#include <plearn/var/ConcatColumnsVariable.h>
#include <plearn/var/ConcatRowsVariable.h>
#include <plearn/var/CutBelowThresholdVariable.h>
#include <plearn/display/DisplayUtils.h>
#include <plearn/var/DotProductVariable.h>
#include <plearn/var/IfThenElseVariable.h>
#include <plearn/var/IsAboveThresholdVariable.h>
#include <plearn/var/LogVariable.h>
#include <plearn/var/SoftSlopeVariable.h>
#include <plearn/var/SoftSlopeIntegralVariable.h>
#include <plearn/var/SoftplusVariable.h>
#include <plearn/var/SubMatTransposeVariable.h>
#include <plearn/var/SubMatVariable.h>
#include <plearn/var/SumAbsVariable.h>
#include <plearn/var/SumOfVariable.h>
#include <plearn/var/SumSquareVariable.h>
#include <plearn/var/SumVariable.h>
#include <plearn/var/TanhVariable.h>
#include <plearn/var/TransposeProductVariable.h>
#include <plearn/math/random.h>

namespace PLearn {
using namespace std;

ConditionalDensityNet::ConditionalDensityNet()
    : nhidden(0),
      nhidden2(0),
      weight_decay(0),
      bias_decay(1e-6),
      layer1_weight_decay(0),
      layer1_bias_decay(0),
      layer2_weight_decay(0),
      layer2_bias_decay(0),
      output_layer_weight_decay(0),
      output_layer_bias_decay(0),
      direct_in_to_out_weight_decay(0),
      L1_penalty(false),
      direct_in_to_out(false),
      batch_size(1),
      c_penalization(0),
      maxY(1),
      thresholdY(0.1),
      log_likelihood_vs_squared_error_balance(1),
      separate_mass_point(1),
      n_output_density_terms(0),
      generate_precision(1e-3),
      steps_type("sloped_steps"),
      centers_initialization("data"),
      curve_positions("uniform"),
      scale(5.0),
      unconditional_p0(0.01),
      mu_is_fixed(true),
      initial_hardness(1)
{}
PLEARN_IMPLEMENT_OBJECT(
    ConditionalDensityNet,
    "Neural Network that Implements a Positive Random Variable Conditional Density",
    "The input vector is used to compute parameters of an output density or output\n"
    "cumulative distribution as well as output expected value. The ASSUMPTIONS\n"
    "on the generating distribution P(Y|X) are the following:\n"
    "  * Y is a single real value\n"
    "  * 0 <= Y <= maxY, with maxY a known finite value\n"
    "  * the density has a mass point at Y=0\n"
    "  * the density is continuous for Y>0\n"
    "The form of the conditional cumulative of Y is the following (separate_mass_point=false):\n"
    "   P(Y<=y|theta) = (1/Z) (s(a) + sum_i u_i s(b_i) g(y,theta,i))\n"
    "or for separate_mass_point=true:\n"
    "   P(Y<=y|theta) = sigmoid(a) + (1-sigmoid(a)) (sum_i u_i s(b_i) (g(y,theta,i)-g(0,theta,i)))/Z\n"
    "where s(z)=log(1+exp(z)) is the softplus function, and g is a monotonic function\n"
    "in y whose first derivative and indefinite integral are known analytically.\n"
    "The u_i are fixed from the unconditional distribution, such that s(b_i)=1 gives\n"
    "approximately the right unconditional cumulative function (for infinite hardness):\n"
    "   u_i = P(mu_{i-1}<Y<=mu_i) [unconditional].\n"
    "The parameters theta of Y's distribution are (a,b_1,b_2,...,c_1,c_2,...,mu_1,mu_2,...),\n"
    "which are obtained as the unconstrained outputs (no output transfer function) of a neural network.\n"
    "The normalization constant Z is easily computed analytically: (separate_mass_point=false)\n"
    "   Z = s(a) + sum_i u_i s(b_i) g(maxY,theta,i)\n"
    "or for separate_mass_point=true:\n"
    "   Z = sum_i u_i s(b_i) (g(maxY,theta,i)-g(0,theta,i))\n"
    "The current implementation considers two choices for g:\n"
    "  - sigmoid_steps: g(y,theta,i) = sigmoid(h*s(c_i)*(y-mu_i)/(mu_{i+1}-mu_i))\n"
    "  - sloped_steps: g(y,theta,i) = 1 + (s(s(c_i)*(mu_i-y))-s(s(c_i)*(mu_{i+1}-y)))/(s(c_i)*(mu_{i+1}-mu_i))\n"
    "where h is the 'initial_hardness' option.\n"
    "The density is analytically obtained using the derivative g' of g and the\n"
    "expected value is analytically obtained using the primitive G of g.\n"
    "For the mass point at the origin,\n"
    "   P(Y=0|theta) = P(Y<=0|theta)\n"
    "(which is simply sigmoid(a) if separate_mass_point).\n"
    "For positive values of Y: (separate_mass_point=false)\n"
    "   p(y|theta) = (1/Z) sum_i u_i s(b_i) g'(y,theta,i)\n"
    "or for separate_mass_point=true:\n"
    "   p(y|theta) = (1-sigmoid(a)) (1/Z) sum_i u_i s(b_i) g'(y,theta,i).\n"
    "And the expected value of Y is obtained using the primitive: (separate_mass_point=false)\n"
    "   E[Y|theta] = (1/Z)*(s(a)*M + sum_i u_i s(b_i)(G(M,theta,i)-G(0,theta,i)))\n"
    "or for separate_mass_point=true:\n"
    "   E[Y|theta] = M - ((sigmoid(a)-(1-sigmoid(a))*(1/Z)*sum_i u_i s(b_i) g(0,theta,i))*M\n"
    "                     + (1-sigmoid(a))*(1/Z)*sum_i u_i s(b_i)(G(M,theta,i)-G(0,theta,i)))\n"
    "Training the model can be done by maximum likelihood (minimizing the negative log of the\n"
    "density) or by minimizing the average squared error (y-E[Y|theta])^2,\n"
    "or a combination of the two (with the log_likelihood_vs_squared_error_balance option).\n"
    "The step 'centers' mu_i are initialized according to some rule, in the interval [0,maxY]:\n"
    "  - uniform: at regular intervals in [0,maxY]\n"
    "  - log-scale: as the exponential of values at regular intervals in [0,log(1+maxY)], minus 1.\n"
    "The c_i and b_i are initialized to inverse_softplus(1), and a using the empirical unconditional P(Y=0).\n"
    "For the output curve options (outputs_def='L', 'D', 'C', or 'S'), the lower_bound and upper_bound\n"
    "options of PDistribution are automatically set to 0 and maxY respectively.\n");
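
// Illustrative sketch (not part of the PLearn API): numerically evaluating the
// cumulative above for separate_mass_point=false with sigmoid steps, given
// fixed parameters theta=(a,b,c), centers mu and unconditional weights u.
// The helper names and the per-step hardness h_i = h*s(c_i)/(mu_i - mu_{i-1})
// are assumptions made for illustration only; the real computation is built
// symbolically as a Var graph in build_() below.
//
//     #include <cmath>
//     #include <vector>
//     static double sp(double z)  { return log(1 + exp(z)); }   // softplus s(z)
//     static double sig(double z) { return 1 / (1 + exp(-z)); } // sigmoid
//     // P(Y<=y|theta) = (s(a) + sum_i u_i s(b_i) g(y,i)) / Z, where Z is the
//     // same expression evaluated at y = maxY so that P(Y<=maxY|theta) = 1.
//     static double cdf_sketch(double y, double a,
//                              const std::vector<double>& b, const std::vector<double>& c,
//                              const std::vector<double>& mu, const std::vector<double>& u,
//                              double h, double maxY)
//     {
//         double num = sp(a), Z = sp(a);
//         for (size_t i = 0; i < mu.size(); i++) {
//             double prev = (i == 0) ? 0 : mu[i-1];
//             double hi = h * sp(c[i]) / (mu[i] - prev);         // step hardness
//             num += u[i] * sp(b[i]) * sig(hi * (y - mu[i]));
//             Z   += u[i] * sp(b[i]) * sig(hi * (maxY - mu[i]));
//         }
//         return num / Z;
//     }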

void ConditionalDensityNet::declareOptions(OptionList& ol)
{
    declareOption(ol, "nhidden", &ConditionalDensityNet::nhidden, OptionBase::buildoption,
                  "    number of hidden units in first hidden layer (0 means no hidden layer)\n");

    declareOption(ol, "nhidden2", &ConditionalDensityNet::nhidden2, OptionBase::buildoption,
                  "    number of hidden units in second hidden layer (0 means no hidden layer)\n");

    declareOption(ol, "weight_decay", &ConditionalDensityNet::weight_decay, OptionBase::buildoption,
                  "    global weight decay for all layers\n");

    declareOption(ol, "bias_decay", &ConditionalDensityNet::bias_decay, OptionBase::buildoption,
                  "    global bias decay for all layers\n");

    declareOption(ol, "layer1_weight_decay", &ConditionalDensityNet::layer1_weight_decay, OptionBase::buildoption,
                  "    Additional weight decay for the first hidden layer. Is added to weight_decay.\n");
    declareOption(ol, "layer1_bias_decay", &ConditionalDensityNet::layer1_bias_decay, OptionBase::buildoption,
                  "    Additional bias decay for the first hidden layer. Is added to bias_decay.\n");

    declareOption(ol, "layer2_weight_decay", &ConditionalDensityNet::layer2_weight_decay, OptionBase::buildoption,
                  "    Additional weight decay for the second hidden layer. Is added to weight_decay.\n");

    declareOption(ol, "layer2_bias_decay", &ConditionalDensityNet::layer2_bias_decay, OptionBase::buildoption,
                  "    Additional bias decay for the second hidden layer. Is added to bias_decay.\n");

    declareOption(ol, "output_layer_weight_decay", &ConditionalDensityNet::output_layer_weight_decay, OptionBase::buildoption,
                  "    Additional weight decay for the output layer. Is added to 'weight_decay'.\n");

    declareOption(ol, "output_layer_bias_decay", &ConditionalDensityNet::output_layer_bias_decay, OptionBase::buildoption,
                  "    Additional bias decay for the output layer. Is added to 'bias_decay'.\n");

    declareOption(ol, "direct_in_to_out_weight_decay", &ConditionalDensityNet::direct_in_to_out_weight_decay, OptionBase::buildoption,
                  "    Additional weight decay for the direct in-to-out layer. Is added to 'weight_decay'.\n");

    declareOption(ol, "L1_penalty", &ConditionalDensityNet::L1_penalty, OptionBase::buildoption,
                  "    should we use L1 penalty instead of the default L2 penalty on the weights? (default=0)\n");

    declareOption(ol, "direct_in_to_out", &ConditionalDensityNet::direct_in_to_out, OptionBase::buildoption,
                  "    should we include direct input to output connections? (default=0)\n");

    declareOption(ol, "optimizer", &ConditionalDensityNet::optimizer, OptionBase::buildoption,
                  "    specify the optimizer to use\n");

    declareOption(ol, "batch_size", &ConditionalDensityNet::batch_size, OptionBase::buildoption,
                  "    how many samples to use to estimate the average gradient before updating the weights\n"
                  "    0 is equivalent to specifying training_set->length(); default=1 (stochastic gradient)\n");

    declareOption(ol, "maxY", &ConditionalDensityNet::maxY, OptionBase::buildoption,
                  "    maximum allowed value for Y. Default = 1.0 (data normalized in [0,1])\n");

    declareOption(ol, "thresholdY", &ConditionalDensityNet::thresholdY, OptionBase::buildoption,
                  "    threshold value of Y for which we might want to compute P(Y>thresholdY), with outputs_def='t'\n");

    declareOption(ol, "log_likelihood_vs_squared_error_balance", &ConditionalDensityNet::log_likelihood_vs_squared_error_balance,
                  OptionBase::buildoption,
                  "    Relative weight given to negative log-likelihood (1 minus this weight is given to squared error). Default=1\n");

    declareOption(ol, "n_output_density_terms", &ConditionalDensityNet::n_output_density_terms,
                  OptionBase::buildoption,
                  "    Number of terms (steps) in the output density function.\n");

    declareOption(ol, "steps_type", &ConditionalDensityNet::steps_type,
                  OptionBase::buildoption,
                  "    The type of steps used to build the cumulative distribution.\n"
                  "    Allowed values are:\n"
                  "      - sigmoid_steps: g(y,theta,i) = sigmoid(s(c_i)*(y-mu_i))\n"
                  "      - sloped_steps: g(y,theta,i) = s(s(c_i)*(mu_i-y))-s(s(c_i)*(mu_{i+1}-y))\n"
                  "    Default=sloped_steps\n");

    declareOption(ol, "centers_initialization", &ConditionalDensityNet::centers_initialization,
                  OptionBase::buildoption,
                  "    How to initialize the step centers (mu_i). Allowed values are:\n"
                  "      - data: from the data at regular quantiles, with last one at maxY (default)\n"
                  "      - uniform: at regular intervals in [0,maxY]\n"
                  "      - log-scale: as the exponential of values at regular intervals in log-scale, using formula:\n"
                  "          i-th position = (exp(scale*(i+1-n_output_density_terms)/n_output_density_terms)-exp(-scale))/(1-exp(-scale))\n");
    declareOption(ol, "curve_positions", &ConditionalDensityNet::curve_positions,
                  OptionBase::buildoption,
                  "    How to choose the y-values for the probability curve (upper case outputs_def):\n"
                  "      - uniform: at regular intervals in [0,maxY]\n"
                  "      - log-scale: as the exponential of values at regular intervals in log-scale, using formula:\n"
                  "          i-th position = (exp(scale*(i+1-n_output_density_terms)/n_output_density_terms)-exp(-scale))/(1-exp(-scale))\n");
    declareOption(ol, "scale", &ConditionalDensityNet::scale,
                  OptionBase::buildoption,
                  "    scale used in the log-scale formula for centers_initialization and curve_positions\n");

    declareOption(ol, "unconditional_p0", &ConditionalDensityNet::unconditional_p0, OptionBase::buildoption,
                  "    approximate unconditional probability of Y=0 (mass point), used\n"
                  "    to initialize the parameters.\n");

    declareOption(ol, "mu_is_fixed", &ConditionalDensityNet::mu_is_fixed, OptionBase::buildoption,
                  "    whether to keep the step centers (mu[i]) fixed or to learn them.\n");

    declareOption(ol, "separate_mass_point", &ConditionalDensityNet::separate_mass_point, OptionBase::buildoption,
                  "    whether to model separately the mass point at the origin.\n");

    declareOption(ol, "initial_hardness", &ConditionalDensityNet::initial_hardness, OptionBase::buildoption,
                  "    value that scales softplus(c).\n");

    declareOption(ol, "c_penalization", &ConditionalDensityNet::c_penalization, OptionBase::buildoption,
                  "    the penalization coefficient for the 'c' output of the neural network\n");

    declareOption(ol, "generate_precision", &ConditionalDensityNet::generate_precision, OptionBase::buildoption,
                  "    precision when generating a new sample\n");

    declareOption(ol, "paramsvalues", &ConditionalDensityNet::paramsvalues, OptionBase::learntoption,
                  "    The learned neural network parameter vector\n");

    declareOption(ol, "unconditional_cdf", &ConditionalDensityNet::unconditional_cdf, OptionBase::learntoption,
                  "    Unconditional cumulative distribution function.\n");

    declareOption(ol, "unconditional_delta_cdf", &ConditionalDensityNet::unconditional_delta_cdf, OptionBase::learntoption,
                  "    Variations of the cdf from one step center to the next (this is u_i in the above equations).\n");

    declareOption(ol, "mu", &ConditionalDensityNet::mu, OptionBase::learntoption,
                  "    Step centers.\n");

    declareOption(ol, "y_values", &ConditionalDensityNet::y_values, OptionBase::learntoption,
                  "    Values of Y at which the cumulative (or density or survival) curves are computed if required.\n");

    inherited::declareOptions(ol);
}
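
// Hedged usage sketch (assumptions: the generic Object::setOption interface,
// a VMat 'trainvm' of (input,target) rows, and some PLearn optimizer; this is
// illustration, not a prescribed recipe). A typical configuration sets the
// architecture and density options before providing data:
//
//     ConditionalDensityNet net;
//     net.setOption("nhidden", "10");
//     net.setOption("n_output_density_terms", "5");
//     net.setOption("maxY", "1.0");            // targets assumed in [0,1]
//     net.setOption("steps_type", "sloped_steps");
//     // an optimizer must also be supplied via the 'optimizer' option
//     net.setTrainingSet(trainvm);
//     net.build();
//     net.train();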

void ConditionalDensityNet::build_()
{
    if(inputsize_>=0 && targetsize_>=0 && weightsize_>=0)
    {
        lower_bound = 0;
        upper_bound = maxY;
        int n_output_parameters = mu_is_fixed ? (1+n_output_density_terms*2) : (1+n_output_density_terms*3);

        if (n_curve_points<0)
            n_curve_points = n_output_density_terms+1;

        input = Var(n_input, "input");
        output = input;
        params.resize(0);

        if(nhidden>0)
        {
            w1 = Var(1+n_input, nhidden, "w1");
            output = tanh(affine_transform(output, w1));
            params.append(w1);
        }

        if(nhidden2>0)
        {
            w2 = Var(1+nhidden, nhidden2, "w2");
            output = tanh(affine_transform(output, w2));
            params.append(w2);
        }

        if (nhidden2>0 && nhidden==0)
            PLERROR("ConditionalDensityNet:: can't have nhidden2 (=%d) > 0 while nhidden=0", nhidden2);

        if (nhidden==-1)
        {
            wout = Var(1, n_output_parameters, "wout");
            output = transpose(wout);
        }
        else
        {
            wout = Var(1+output->size(), n_output_parameters, "wout");
            output = affine_transform(output, wout);
        }
        params.append(wout);

        if(direct_in_to_out)
        {
            wdirect = Var(n_input, n_output_parameters, "wdirect");
            output += transposeProduct(wdirect, input);
            params.append(wdirect);
        }

        target = Var(n_target, "target");

        if(weightsize_>0)
        {
            if (weightsize_!=1)
                PLERROR("ConditionalDensityNet: expected weightsize to be 1 or 0 (or unspecified = -1, meaning 0), got %d", weightsize_);
            sampleweight = Var(1, "weight");
        }

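        // The network output vector is interpreted as the unconstrained density
        // parameters, laid out as [ a | b_1..b_n | c_1..c_n | (mu_1..mu_n) ] with
        // n = n_output_density_terms; the mu block is only present when
        // mu_is_fixed is false (otherwise mu is a separate fixed Var).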
        int i=0;
        a = output[i++];
        a->setName("a");
        b = new SubMatVariable(output,i,0,n_output_density_terms,1);
        b->setName("b");
        i+=n_output_density_terms;
        c = new SubMatVariable(output,i,0,n_output_density_terms,1);
        c->setName("c");

        if (!mu || (mu->length()!=n_output_density_terms && train_set))
        {
            if (mu_is_fixed)
                mu = Var(n_output_density_terms,1);
            else
            {
                i+=n_output_density_terms;
                mu = new SubMatVariable(output,i,0,n_output_density_terms,1);
            }
        }
        mu->setName("mu");

        Var nll;
        Var max_y = var(maxY);
        Var left_side = vconcat(var(0.0) & (new SubMatVariable(mu,0,0,n_output_density_terms-1,1)));
        centers = target-mu;
        centers_M = max_y-mu;
        unconditional_cdf.resize(n_output_density_terms);
        if (unconditional_delta_cdf)
        {
            if (unconditional_delta_cdf.length()!=n_output_density_terms)
                unconditional_delta_cdf->resize(n_output_density_terms,1);
        }
        else
            unconditional_delta_cdf = Var(n_output_density_terms,1);
        initial_hardnesses = var(initial_hardness) / (mu - left_side);
        pos_b = softplus(b)*unconditional_delta_cdf;
        pos_c = softplus(c)*initial_hardnesses;
        Var scaled_centers = pos_c*centers;
        Var scaled_centers_M = pos_c*centers_M;
        Var scaled_centers_0 = -pos_c*mu;
        Var lhopital, inverse_denominator, density_numerator;
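        // Each steps_type must provide: the step values at the target y (steps),
        // at maxY (steps_M) and at 0 (steps_0), the derivative in y
        // (steps_gradient), and the integral over [0,maxY] (steps_integral).
        // When pos_c -> 0 the integral expression becomes 0/0, so delta_steps
        // holds its analytic limit and 'lhopital' selects between the two via
        // ifThenElse(isAboveThreshold(...)) below.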
        if (separate_mass_point)
        {
            pos_a = sigmoid(a);
            if (steps_type=="sigmoid_steps")
            {
                steps = sigmoid(scaled_centers);
                steps_M = sigmoid(scaled_centers_M);
                steps_0 = sigmoid(scaled_centers_0);
                steps_gradient = pos_c*steps*(1-steps);
                steps_integral = (softplus(scaled_centers_M) - softplus(scaled_centers_0))/pos_c;
                delta_steps = centers_M*steps_M + mu*sigmoid(scaled_centers_0);
            }
            else if (steps_type=="sloped_steps")
            {
                steps = soft_slope(target, pos_c, left_side, mu);
                steps_M = soft_slope(max_y, pos_c, left_side, mu);
                steps_0 = soft_slope(var(0.0), pos_c, left_side, mu);
                steps_gradient = d_soft_slope(target, pos_c, left_side, mu);
                steps_integral = soft_slope_integral(pos_c,left_side,mu,0.0,maxY);
                delta_steps = soft_slope_limit(target, pos_c, left_side, mu);
            }
            else PLERROR("ConditionalDensityNet::build, steps_type option value unknown: %s", steps_type.c_str());

            density_numerator = dot(pos_b, steps_gradient);
            cum_denominator = dot(pos_b, positive(steps_M-steps_0));
            inverse_denominator = 1.0/cum_denominator;
            cum_numerator = dot(pos_b, (steps-steps_0));
            cumulative = pos_a + (1-pos_a) * cum_numerator * inverse_denominator;
            density = density_numerator * inverse_denominator;

            lhopital = ifThenElse(isAboveThreshold(pos_c,1e-20), steps_integral, delta_steps);
            expected_value = max_y - ((pos_a-(1-pos_a)*inverse_denominator*dot(pos_b,steps_0))*max_y +
                                      (1-pos_a)*dot(pos_b,lhopital)*inverse_denominator);
            mass_cost = -log(ifThenElse(isAboveThreshold(target,0.0,1,0,true),(1-pos_a),pos_a));
            pos_y_cost = ifThenElse(isAboveThreshold(target,0.0,1,0,true),-log(density),var(0.0));
            nll = -log(ifThenElse(isAboveThreshold(target,0.0,1,0,true),density*(1-pos_a),pos_a));
        }
        else
        {
            pos_a = var(0.0);
            if (steps_type=="sigmoid_steps")
            {
                steps = sigmoid(scaled_centers);
                steps_M = sigmoid(scaled_centers_M);
                steps_0 = sigmoid(scaled_centers_0);
                steps_gradient = pos_c*steps*(1-steps);
                steps_integral = (softplus(scaled_centers_M) - softplus(scaled_centers_0))/pos_c;
                delta_steps = centers_M*steps_M + mu*sigmoid(scaled_centers_0);
            }
            else if (steps_type=="sloped_steps")
            {
                steps = soft_slope(target, pos_c, left_side, mu);
                steps_M = soft_slope(max_y, pos_c, left_side, mu);
                steps_0 = soft_slope(var(0.0), pos_c, left_side, mu);
                steps_gradient = d_soft_slope(target, pos_c, left_side, mu);
                steps_integral = soft_slope_integral(pos_c,left_side,mu,0.0,maxY);
                delta_steps = soft_slope_limit(target, pos_c, left_side, mu);
            }
            else PLERROR("ConditionalDensityNet::build, steps_type option value unknown: %s", steps_type.c_str());

            density_numerator = dot(pos_b, steps_gradient);
            cum_denominator = dot(pos_b, steps_M - steps_0);
            inverse_denominator = 1.0/cum_denominator;
            cum_numerator = dot(pos_b, steps - steps_0);
            cumulative = cum_numerator * inverse_denominator;
            density = density_numerator * inverse_denominator;

            lhopital = ifThenElse(isAboveThreshold(pos_c,1e-20), steps_integral, delta_steps);
            expected_value = dot(pos_b,lhopital)*inverse_denominator;
            nll = -log(ifThenElse(isAboveThreshold(target,0.0,1,0,true), density, cumulative));
        }
        max_y->setName("maxY");
        left_side->setName("left_side");
        pos_a->setName("pos_a");
        pos_b->setName("pos_b");
        pos_c->setName("pos_c");
        steps->setName("steps");
        steps_M->setName("steps_M");
        steps_integral->setName("steps_integral");
        expected_value->setName("expected_value");
        density_numerator->setName("density_numerator");
        cum_denominator->setName("cum_denominator");
        inverse_denominator->setName("inverse_denominator");
        cum_numerator->setName("cum_numerator");
        cumulative->setName("cumulative");
        density->setName("density");
        lhopital->setName("lhopital");

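        // Cost layout: costs[0] is the training criterion (NLL, squared error,
        // or their convex combination as per
        // log_likelihood_vs_squared_error_balance, plus the optional 'c'
        // penalization), costs[1] is the NLL and costs[2] the squared error
        // (target - E[Y|x])^2.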
        costs.resize(3);

        costs[1] = nll;
        costs[2] = square(target-expected_value);

        if (log_likelihood_vs_squared_error_balance==1)
            costs[0] = costs[1];
        else if (log_likelihood_vs_squared_error_balance==0)
            costs[0] = costs[2];
        else costs[0] = log_likelihood_vs_squared_error_balance*costs[1]+
                 (1-log_likelihood_vs_squared_error_balance)*costs[2];
        if (c_penalization > 0) {
            costs[0] = costs[0] + c_penalization * sumsquare(c);
        }

        penalties.resize(0);
        if(w1 && ((layer1_weight_decay + weight_decay)!=0 || (layer1_bias_decay + bias_decay)!=0))
            penalties.append(affine_transform_weight_penalty(w1, (layer1_weight_decay + weight_decay), (layer1_bias_decay + bias_decay), L1_penalty));
        if(w2 && ((layer2_weight_decay + weight_decay)!=0 || (layer2_bias_decay + bias_decay)!=0))
            penalties.append(affine_transform_weight_penalty(w2, (layer2_weight_decay + weight_decay), (layer2_bias_decay + bias_decay), L1_penalty));
        if(wout && ((output_layer_weight_decay + weight_decay)!=0 || (output_layer_bias_decay + bias_decay)!=0))
            penalties.append(affine_transform_weight_penalty(wout, (output_layer_weight_decay + weight_decay),
                                                             (output_layer_bias_decay + bias_decay), L1_penalty));
        if(wdirect && (direct_in_to_out_weight_decay + weight_decay) != 0)
        {
            if (L1_penalty)
                penalties.append(sumabs(wdirect)*(direct_in_to_out_weight_decay + weight_decay));
            else
                penalties.append(sumsquare(wdirect)*(direct_in_to_out_weight_decay + weight_decay));
        }

        test_costs = hconcat(costs);

        if(penalties.size() != 0) {
            if (weightsize_>0)
                training_cost = hconcat(sampleweight*sum(hconcat(costs[0] & penalties))
                                        & (test_costs*sampleweight));
            else {
                training_cost = hconcat(sum(hconcat(costs[0] & penalties)) & test_costs);
            }
        }
        else {
            if(weightsize_>0) {
                training_cost = test_costs*sampleweight;
            } else {
                training_cost = test_costs;
            }
        }

        training_cost->setName("training_cost");
        test_costs->setName("test_costs");
        output->setName("output");

        bool use_paramsvalues = (bool)paramsvalues && (paramsvalues.size() == params.nelems());
        if(use_paramsvalues)
        {
            params << paramsvalues;
            initialize_mu(mu->value);
        }
        else
        {
            paramsvalues.resize(params.nelems());
            initializeParams();
        }
        params.makeSharedValue(paramsvalues);

        VarArray output_and_target = output & target;
        output_and_target_values.resize(output.length()+target.length());
        output_and_target.makeSharedValue(output_and_target_values);

        cdf_f = Func(output_and_target, cumulative);
        mean_f = Func(output, expected_value);
        density_f = Func(output_and_target, density);

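        // cdf_f and density_f map the concatenated (output,target) pair to the
        // cumulative and density Vars, and mean_f maps the network output to
        // E[Y|x]. Because output_and_target shares its storage with
        // output_and_target_values, the cdf() and log_density() methods below
        // only need to write a candidate y into target->value before fprop.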
        VarArray outvars;
        VarArray testinvars;
        invars.resize(0);
        if(input)
        {
            invars.push_back(input);
            testinvars.push_back(input);
        }
        if(expected_value)
        {
            outvars.push_back(expected_value);
        }
        if(target)
        {
            invars.push_back(target);
            testinvars.push_back(target);
            outvars.push_back(target);
        }
        if(sampleweight)
        {
            invars.push_back(sampleweight);
        }

        VarArray outputs_array;

        for (unsigned int i=0; i<outputs_def.length(); i++)
        {
            if (outputs_def[i]=='e')
                outputs_array &= expected_value;
            else if (outputs_def[i]=='t')
            {
                Func survival_f(target&output, var(1.0)-cumulative);
                Var threshold_y(1,1);
                threshold_y->valuedata[0] = thresholdY;
                outputs_array &= survival_f(threshold_y & output);
            }
            else if (outputs_def[i]=='S' || outputs_def[i]=='C' ||
                     outputs_def[i]=='L' || outputs_def[i]=='D')
            {
                Func prob_f(target&output, outputs_def[i]=='S' ? (var(1.0)-cumulative) :
                            (outputs_def[i]=='C' ? cumulative :
                             (outputs_def[i]=='D' ? density : log(density))));
                y_values.resize(n_curve_points);
                if (curve_positions=="uniform")
                {
                    real delta = maxY/(n_curve_points-1);
                    for (int j=0; j<n_curve_points; j++)
                    {
                        y_values[j] = var(j*delta);
                        y_values[j]->setName("y"+tostring(j));
                        outputs_array &= prob_f(y_values[j] & output);
                    }
                }
                else
                {
                    real denom = 1.0/(1-exp(-scale));
                    for (int j=0; j<n_curve_points; j++)
                    {
                        y_values[j] = var((exp(scale*(j-n_output_density_terms)/n_output_density_terms)-exp(-scale))*denom);
                        y_values[j]->setName("y"+tostring(j));
                        outputs_array &= prob_f(y_values[j] & output);
                    }
                }
            }
            else
                outputs_array &= expected_value;
        }
        outputs = hconcat(outputs_array);
        if (mu_is_fixed)
            f = Func(input, params&mu, outputs);
        else
            f = Func(input, params, outputs);
        f->recomputeParents();

        in2distr_f = Func(input, pos_a);
        in2distr_f->recomputeParents();

        if (mu_is_fixed)
            test_costf = Func(testinvars, params&mu, outputs&test_costs);
        else
            test_costf = Func(testinvars, params, outputs&test_costs);

        if (use_paramsvalues)
            test_costf->recomputeParents();
    }
    PDistribution::finishConditionalBuild();
}

TVec<string> ConditionalDensityNet::getTrainCostNames() const
{
    if (penalties.size() > 0)
    {
        TVec<string> cost_funcs(4);
        cost_funcs[0]="training_criterion+penalty";
        cost_funcs[1]="training_criterion";
        cost_funcs[2]="NLL";
        cost_funcs[3]="mse";
        return cost_funcs;
    }
    else return getTestCostNames();
}

void ConditionalDensityNet::build()
{
    inherited::build();
    build_();
}

extern void varDeepCopyField(Var& field, CopiesMap& copies);

void ConditionalDensityNet::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies)
{
    inherited::makeDeepCopyFromShallowCopy(copies);
    varDeepCopyField(input, copies);
    varDeepCopyField(target, copies);
    varDeepCopyField(sampleweight, copies);
    varDeepCopyField(w1, copies);
    varDeepCopyField(w2, copies);
    varDeepCopyField(wout, copies);
    varDeepCopyField(wdirect, copies);
    varDeepCopyField(output, copies);
    varDeepCopyField(outputs, copies);
    varDeepCopyField(a, copies);
    varDeepCopyField(pos_a, copies);
    varDeepCopyField(b, copies);
    varDeepCopyField(pos_b, copies);
    varDeepCopyField(c, copies);
    varDeepCopyField(pos_c, copies);
    varDeepCopyField(density, copies);
    varDeepCopyField(cumulative, copies);
    varDeepCopyField(expected_value, copies);
    deepCopyField(costs, copies);
    deepCopyField(penalties, copies);
    varDeepCopyField(training_cost, copies);
    varDeepCopyField(test_costs, copies);
    deepCopyField(invars, copies);
    deepCopyField(params, copies);
    deepCopyField(paramsvalues, copies);
    varDeepCopyField(centers, copies);
    varDeepCopyField(centers_M, copies);
    varDeepCopyField(steps, copies);
    varDeepCopyField(steps_M, copies);
    varDeepCopyField(steps_0, copies);
    varDeepCopyField(steps_gradient, copies);
    varDeepCopyField(steps_integral, copies);
    varDeepCopyField(delta_steps, copies);
    varDeepCopyField(cum_numerator, copies);
    varDeepCopyField(cum_denominator, copies);
    deepCopyField(unconditional_cdf, copies);
    varDeepCopyField(unconditional_delta_cdf, copies);
    varDeepCopyField(initial_hardnesses, copies);
    varDeepCopyField(prev_centers, copies);
    varDeepCopyField(prev_centers_M, copies);
    varDeepCopyField(scaled_prev_centers, copies);
    varDeepCopyField(scaled_prev_centers_M, copies);
    varDeepCopyField(minus_prev_centers_0, copies);
    varDeepCopyField(minus_scaled_prev_centers_0, copies);
    deepCopyField(y_values, copies);
    varDeepCopyField(mu, copies);
    deepCopyField(f, copies);
    deepCopyField(test_costf, copies);
    deepCopyField(output_and_target_to_cost, copies);
    deepCopyField(cdf_f, copies);
    deepCopyField(mean_f, copies);
    deepCopyField(density_f, copies);
    deepCopyField(in2distr_f, copies);
    deepCopyField(output_and_target, copies);
    deepCopyField(output_and_target_values, copies);
    varDeepCopyField(totalcost, copies);
    varDeepCopyField(mass_cost, copies);
    varDeepCopyField(pos_y_cost, copies);
    deepCopyField(optimizer, copies);
}

void ConditionalDensityNet::setInput(const Vec& in) const
{
#ifdef BOUNDCHECK
    if (!f)
        PLERROR("ConditionalDensityNet::setInput: build was not completed (maybe because training set was not provided)!");
#endif
    in2distr_f->fprop(in, pos_a->value);
}

real ConditionalDensityNet::log_density(const Vec& y) const
{
    static Vec d;
    d.resize(1);
    target->value << y;
    density_f->fprop(output_and_target_values, d);
    return log(d[0]);
}

real ConditionalDensityNet::survival_fn(const Vec& y) const
{
    return 1 - cdf(y);
}

real ConditionalDensityNet::cdf(const Vec& y) const
{
    Vec cum(1);
    target->value << y;
    cdf_f->fprop(output_and_target_values, cum);
#ifdef BOUNDCHECK
    if (cum[0] < -1e-3)
        PLERROR("In ConditionalDensityNet::cdf - The cdf is < 0");
#endif
    return cum[0];
}

void ConditionalDensityNet::expectation(Vec& mu) const
{
    mu.resize(n_target);
    mean_f->fprop(output->value, mu);
}

void ConditionalDensityNet::variance(Mat& covar) const
{
    PLERROR("variance not implemented for ConditionalDensityNet");
}

void ConditionalDensityNet::resetGenerator(long g_seed) const
{
    manual_seed(g_seed);
}
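
// Sampling by inverse-transform: draw u ~ U(0,1); if u falls within the mass
// point at the origin (u < pos_a), return 0; otherwise invert the conditional
// cdf by bisection on [0,maxY], stopping once the bracket is narrower than
// generate_precision*maxY.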
void ConditionalDensityNet::generate(Vec& y) const
{
    real u = uniform_sample();
    y.resize(1);
    if (u<pos_a->value[0])
    {
        y[0]=0;
        return;
    }

    real y0=0;
    real y2=maxY;
    real delta;
    real p;
    do
    {
        delta = y2 - y0;
        y[0] = y0 + delta*0.5;
        p = cdf(y);
        if (p<u)
            y0 = y[0];
        else
            y2 = y[0];
    }
    while (delta > generate_precision * maxY);
}

void ConditionalDensityNet::initializeParams()
{
    if (seed_>=0)
        manual_seed(seed_);
    else
        PLearn::seed();

    real delta = 1.0 / n_input;

    if(nhidden>0)
    {
        fill_random_normal(w1->value, 0, delta);
        if(direct_in_to_out)
        {
            fill_random_normal(wdirect->value, 0, 0.01*delta);
            wdirect->matValue(0).clear();
        }
        delta = 1.0/nhidden;
        w1->matValue(0).clear();
    }
    if(nhidden2>0)
    {
        delta = 0.1/nhidden2;
        fill_random_normal(w2->value, 0, delta);
        w2->matValue(0).clear();
    }
    fill_random_normal(wout->value, 0, delta);
    Mat a_weights = wout->matValue.column(0);

    if (centers_initialization!="data")
    {
        Vec output_biases = wout->matValue(0);
        Vec mu_;
        int i=0;
        Vec a_ = output_biases.subVec(i++,1);
        Vec b_ = output_biases.subVec(i,n_output_density_terms); i+=n_output_density_terms;
        Vec c_ = output_biases.subVec(i,n_output_density_terms); i+=n_output_density_terms;
        if (mu_is_fixed)
            mu_ = mu->value;
        else
            mu_ = output_biases.subVec(i,n_output_density_terms); i+=n_output_density_terms;
        initialize_mu(mu_);
        b_.fill(inverse_softplus(1.0));
        c_.fill(inverse_softplus(1.0));
        if (separate_mass_point)
            a_[0] = unconditional_p0>0 ? inverse_sigmoid(unconditional_p0) : -50;
        else a_[0] = -50;
        unconditional_delta_cdf->value.fill((1.0-unconditional_p0)/n_output_density_terms);
    }

    if(optimizer)
        optimizer->reset();
}

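// Worked example of the log-scale rule implemented below (values approximate):
// with n_output_density_terms=4 and scale=5, the centers are
//   mu_i = (exp(5*(i+1-4)/4)-exp(-5))/(1-exp(-5)),  i=0..3
//        ~ (0.017, 0.076, 0.282, 1.0)
// i.e. geometrically spaced positions accumulating near 0, with the last
// center exactly at 1. Note that, unlike the uniform rule, these positions
// are not multiplied by maxY.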
void ConditionalDensityNet::initialize_mu(Vec& mu_)
{
    if (centers_initialization=="uniform")
    {
        real delta = maxY/n_output_density_terms;
        real center = delta;
        for (int i=0; i<n_output_density_terms; i++, center+=delta)
            mu_[i] = center;
    }
    else if (centers_initialization=="log-scale")
    {
        real denom = 1.0/(1-exp(-scale));
        for (int i=0; i<n_output_density_terms; i++)
            mu_[i] = (exp(scale*(i+1-n_output_density_terms)/n_output_density_terms)-exp(-scale))*denom;
    }
    else if (centers_initialization!="data")
        PLERROR("ConditionalDensityNet::initialize_mu: unknown value %s for centers_initialization option",
                centers_initialization.c_str());
}

void ConditionalDensityNet::forget()
{
    if (train_set) initializeParams();
    stage = 0;
}
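
// train() proceeds in two phases. At stage 0 it makes one pass over the
// training set to estimate the unconditional statistics (P(Y=0), the cdf at
// the step centers and hence u_i = unconditional_delta_cdf), optionally places
// the centers at empirical quantiles (centers_initialization=="data"), and
// sets the output-layer biases so that the initial conditional model matches
// the unconditional one. It then runs the optimizer for nstages-stage epochs
// of optstage_per_lstage updates each.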
void ConditionalDensityNet::train()
{
    int i=0, j=0;
    if(!train_set)
        PLERROR("In ConditionalDensityNet::train, you did not setTrainingSet");

    if(!train_stats)
        PLERROR("In ConditionalDensityNet::train, you did not setTrainStatsCollector");

    if (!already_sorted || n_margin > 0)
        PLERROR("In ConditionalDensityNet::train - Currently, can only be trained if the data is given as input, target");

    if(f.isNull())
        build();

    int l = train_set->length();
    int nsamples = batch_size>0 ? batch_size : l;
    Func paramf = Func(invars, training_cost);
    Var totalcost = meanOf(train_set, paramf, nsamples);
    if(optimizer)
    {
        optimizer->setToOptimize(params, totalcost);
        optimizer->build();
    }

    int optstage_per_lstage = l/nsamples;

    ProgressBar* pb = 0;
    if(report_progress)
        pb = new ProgressBar("Training ConditionalDensityNet from stage " + tostring(stage) + " to " + tostring(nstages), nstages-stage);

    static real weight;

    if (stage==0)
    {
        Vec mu_values = mu->value;
        unconditional_cdf.clear();
        real sum_w=0;
        unconditional_p0 = 0;
        static StatsCollector sc;
        bool init_mu_from_data = centers_initialization=="data";
        if (init_mu_from_data)
        {
            sc.maxnvalues = min(l, 100*n_output_density_terms);
            sc.build();
            sc.forget();
        }
        Vec tmp1(inputsize());
        Vec tmp2(targetsize());
        for (i=0;i<l;i++)
        {
            train_set->getExample(i, tmp1, tmp2, weight);
            input->value << tmp1.subVec(0, n_input);
            target->value << tmp1.subVec(n_input, n_target);
            real y = target->valuedata[0];
            if (y < 0)
                PLERROR("In ConditionalDensityNet::train - Found a negative target");
            if (y > maxY)
                PLERROR("In ConditionalDensityNet::train - Found a target > maxY");
            if (y == 0)
                unconditional_p0 += weight;
            if (init_mu_from_data)
                sc.update(y,weight);
            else
                for (int j=0;j<n_output_density_terms;j++)
                    if (y<=mu_values[j])
                        unconditional_cdf[j] += weight;
            sum_w += weight;
        }
        static Mat cdf;
        unconditional_p0 *= 1.0/sum_w;
        if (init_mu_from_data)
        {
            cdf = sc.cdf();
            int k=3;
            real mean_y = sc.mean();

            real current_mean_fraction = 0;
            real prev_cdf = unconditional_p0;
            real prev_y = 0;
            for (int j=0;j<n_output_density_terms;j++)
            {
                real target_fraction = mean_y*(j+1.0)/n_output_density_terms;
                for (;k<cdf.length() && current_mean_fraction < target_fraction;k++)
                {
                    current_mean_fraction += (cdf(k,0)+prev_y)*0.5*(cdf(k,1)-prev_cdf);
                    prev_cdf = cdf(k,1);
                    prev_y = cdf(k,0);
                }
                if (j==n_output_density_terms-1)
                {
                    mu_values[j]=maxY;
                    unconditional_cdf[j]=1.0;
                }
                else
                {
                    mu_values[j]=cdf(k,0);
                    unconditional_cdf[j]=cdf(k,1);
                }
            }
        }
        else
            for (j=0;j<n_output_density_terms;j++)
                unconditional_cdf[j] *= 1.0/sum_w;

        unconditional_delta_cdf->valuedata[0] = unconditional_cdf[0]-unconditional_p0;
        for (i=1;i<n_output_density_terms;i++)
            unconditional_delta_cdf->valuedata[i] = unconditional_cdf[i]-unconditional_cdf[i-1];

        Vec output_biases = wout->matValue(0);
        i=0;
        Vec a_ = output_biases.subVec(i++,1);
        Vec b_ = output_biases.subVec(i,n_output_density_terms); i+=n_output_density_terms;
        Vec c_ = output_biases.subVec(i,n_output_density_terms); i+=n_output_density_terms;
        Vec mu_;
        Vec s_c(n_output_density_terms);
        if (mu_is_fixed)
            mu_ = mu->value;
        else
            mu_ = output_biases.subVec(i,n_output_density_terms); i+=n_output_density_terms;
        b_.fill(inverse_softplus(1.0));
        initialize_mu(mu_);
        for (i=0;i<n_output_density_terms;i++)
        {
            real prev_mu = i==0 ? 0 : mu_[i-1];
            real delta = mu_[i]-prev_mu;
            s_c[i] = delta>0 ? initial_hardness/delta : -50;
            c_[i] = inverse_softplus(1.0);
        }

        if (centers_initialization!="data")
            unconditional_delta_cdf->value.fill(1.0/n_output_density_terms);
        real *dcdf = unconditional_delta_cdf->valuedata;
        if (separate_mass_point)
            a_[0] = unconditional_p0>0 ? inverse_sigmoid(unconditional_p0) : -50;
        else if (dcdf[0]==0)
            a_[0] = unconditional_p0>0 ? inverse_softplus(unconditional_p0) : -50;
        else
        {
            real s=0;
            if (steps_type=="sigmoid_steps")
                for (i=0;i<n_output_density_terms;i++)
                    s += dcdf[i]*(unconditional_p0*sigmoid(s_c[i]*(maxY-mu_[i]))-sigmoid(-s_c[i]*mu_[i]));
            else
                for (i=0;i<n_output_density_terms;i++)
                {
                    real prev_mu = i==0 ? 0 : mu_[i-1];
                    real ss1 = soft_slope(maxY,s_c[i],prev_mu,mu_[i]);
                    real ss2 = soft_slope(0,s_c[i],prev_mu,mu_[i]);
                    s += dcdf[i]*(unconditional_p0*ss1 - ss2);
                }
            real sa = s/(1-unconditional_p0);
            a_[0] = sa>0 ? inverse_softplus(sa) : -50;
        }
        test_costf->recomputeParents();

        static bool display_graph = false;
        if (display_graph) f->fprop(input->value, outputs->value);
        if (display_graph)
            displayFunction(f, true);
        if (display_graph)
            displayFunction(test_costf, true);
    }
    int initial_stage = stage;
    bool early_stop = false;
    while(stage<nstages && !early_stop)
    {
        optimizer->nstages = optstage_per_lstage;
        train_stats->forget();
        optimizer->early_stop = false;
        early_stop = optimizer->optimizeN(*train_stats);

        static real verify_gradient = 0;
        if (verify_gradient)
        {
            if (batch_size == 0)
            {
                cout << "OPTIMIZER" << endl;
                optimizer->verifyGradient(0.001);
            }
        }
        static bool display_graph = false;
        if (display_graph)
            displayFunction(f, true);
        if (display_graph)
            displayFunction(test_costf, true);

        train_stats->finalize();
        if(verbosity>2)
            cerr << "Epoch " << stage << " train objective: " << train_stats->getMean() << endl;
        ++stage;
        if(pb)
            pb->update(stage-initial_stage);
    }
    if(verbosity>1)
        cerr << "EPOCH " << stage << " train objective: " << train_stats->getMean() << endl;

    if(pb)
        delete pb;

    test_costf->recomputeParents();
}

} // namespace PLearn