00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
#include "SequentialValidation.h"
00039
#include <plearn/math/VecStatsCollector.h>
00040
00041
#include <plearn/vmat/FileVMatrix.h>
00042
#include <plearn_learners/testers/PTester.h>
00043
00044
namespace PLearn {
00045
using namespace std;
00046
00047
00048
// Registers SequentialValidation with the PLearn object system, together
// with its one-line description and its (currently empty) detailed help.
PLEARN_IMPLEMENT_OBJECT(
    SequentialValidation,
    "The SequentialValidation class allows you to describe a typical "
    "sequential validation experiment that you wish to perform.",
    "NO HELP");
00053
00054 SequentialValidation::SequentialValidation()
00055 : init_train_size(1),
00056 last_test_time(-1),
00057 expdir(""),
00058 report_stats(true),
00059 save_final_model(true),
00060 save_initial_model(false),
00061 save_initial_seqval(true),
00062 save_data_sets(false),
00063 save_test_outputs(false),
00064 save_test_costs(false),
00065 save_stat_collectors(false),
00066 provide_learner_expdir(true)
00067 {}
00068
00069 void SequentialValidation::build_()
00070 {
00071
if (
expdir !=
"")
00072 {
00073
if(
pathexists(
expdir))
00074
PLERROR(
"Directory (or file) %s already exists. First move it out of the way.",
expdir.c_str());
00075
if(!
force_mkdir(
expdir))
00076
PLERROR(
"Could not create experiment directory %s",
expdir.c_str());
00077 }
00078 }
00079
00080 void SequentialValidation::build()
00081 {
00082 inherited::build();
00083
build_();
00084 }
00085
00086 void SequentialValidation::declareOptions(
OptionList& ol)
00087 {
00088
declareOption(ol,
"report_stats", &SequentialValidation::report_stats,
00089 OptionBase::buildoption,
00090
"If true, the computed global statistics specified in statnames will be saved in global_stats.pmat \n"
00091
"and the corresponding per-split statistics will be saved in split_stats.pmat \n"
00092
"For reference, all cost names (as given by the learner's getTrainCostNames() and getTestCostNames() ) \n"
00093
"will be reported in files train_cost_names.txt and test_cost_names.txt");
00094
00095
declareOption(ol,
"statnames", &SequentialValidation::statnames,
00096 OptionBase::buildoption,
00097
"A list of global statistics we are interested in.\n"
00098
"These are strings of the form S1[S2[dataset.cost_name]] where:\n"
00099
" - dataset is train or test1 or test2 ... (train being \n"
00100
" the first dataset in a split, test1 the second, ...) \n"
00101
" - cost_name is one of the training or test cost names (depending on dataset) understood \n"
00102
" by the underlying learner (see its getTrainCostNames and getTestCostNames methods) \n"
00103
" - S1 and S2 are a statistic, i.e. one of: E (expectation), V(variance), MIN, MAX, STDDEV, ... \n"
00104
" S2 is computed over the samples of a given dataset split. S1 is over the splits. \n");
00105
00106
declareOption(ol,
"timewise_statnames", &SequentialValidation::timewise_statnames,
00107 OptionBase::buildoption,
00108
"Statistics to be collected into a VecStatsCollector at each timestep.");
00109
00110
declareOption(ol,
"expdir", &SequentialValidation::expdir,
00111 OptionBase::buildoption,
00112
"Path of this experiment's directory in which to save all experiment results (will be created if it does not already exist). \n");
00113
00114
declareOption(ol,
"learner", &SequentialValidation::learner,
00115 OptionBase::buildoption,
00116
"The SequentialLearner to train/test. \n");
00117
00118
declareOption(ol,
"dataset", &SequentialValidation::dataset,
00119 OptionBase::buildoption,
00120
"The dataset to use for training/testing. \n");
00121
00122
declareOption(ol,
"init_train_size", &SequentialValidation::init_train_size,
00123 OptionBase::buildoption,
00124
"Size of the first training set. \n");
00125
00126
declareOption(ol,
"last_test_time", &SequentialValidation::last_test_time,
00127 OptionBase::buildoption,
00128
"The last time-step to use for testing (Default = -1, i.e. use all data)");
00129
00130
declareOption(ol,
"save_final_model", &SequentialValidation::save_final_model,
00131 OptionBase::buildoption,
00132
"If true, the final model will be saved in model.psave \n");
00133
00134
declareOption(ol,
"save_initial_model", &SequentialValidation::save_initial_model,
00135 OptionBase::buildoption,
00136
"If true, the initial model will be saved in initial_model.psave. \n");
00137
00138
declareOption(ol,
"save_initial_seqval", &SequentialValidation::save_initial_seqval,
00139 OptionBase::buildoption,
00140
"If true, this SequentialValidation object will be saved in sequential_validation.psave. \n");
00141
00142
declareOption(ol,
"save_data_sets", &SequentialValidation::save_data_sets,
00143 OptionBase::buildoption,
00144
"If true, the data sets (train/test) for each split will be saved. \n");
00145
00146
declareOption(ol,
"save_test_outputs", &SequentialValidation::save_test_outputs,
00147 OptionBase::buildoption,
00148
"If true, the outputs of the tests will be saved in test_outputs.pmat \n");
00149
00150
declareOption(ol,
"save_test_costs", &SequentialValidation::save_test_costs,
00151 OptionBase::buildoption,
00152
"If true, the costs of the tests will be saved in test_costs.pmat \n");
00153
00154
declareOption(ol,
"save_stat_collectors", &SequentialValidation::save_stat_collectors,
00155 OptionBase::buildoption,
00156
"If true, stat collectors of each data sets (train/test) will be saved for each split. \n");
00157
00158
declareOption(ol,
"provide_learner_expdir", &SequentialValidation::provide_learner_expdir,
00159 OptionBase::buildoption,
00160
"If true, learning results from the learner will be saved. \n");
00161
00162 inherited::declareOptions(ol);
00163 }
00164
00165 void SequentialValidation::run()
00166 {
00167
if (
expdir==
"")
00168
PLERROR(
"No expdir specified for SequentialValidation.");
00169
if (!
learner)
00170
PLERROR(
"No learner specified for SequentialValidation.");
00171
00172
00173
learner->setTrainingSet(
dataset,
false);
00174
00175
setExperimentDirectory(
append_slash(
expdir) );
00176
00177
00178
if (
save_initial_seqval)
00179
PLearn::save(
expdir+
"sequential_validation.psave", *
this);
00180
00181
TVec<string> testcostnames =
learner->getTestCostNames();
00182
TVec<string> traincostnames =
learner->getTrainCostNames();
00183
00184
int outputsize =
learner->outputsize();
00185
int nstats =
statnames.
length();
00186
int timewise_nstats =
timewise_statnames.
length();
00187
00188
TVec< PP<VecStatsCollector> > stcol(2);
00189
00190
00191
PP<VecStatsCollector> train_stats =
new VecStatsCollector();
00192 train_stats->setFieldNames(traincostnames);
00193
learner->setTrainStatsCollector(train_stats);
00194 stcol[0] = train_stats;
00195
00196
00197
PP<VecStatsCollector> test_stats =
new VecStatsCollector();
00198 test_stats->setFieldNames(testcostnames);
00199 stcol[1] = test_stats;
00200
00201
00202
PP<VecStatsCollector> sequence_stats =
new VecStatsCollector();
00203
00204
00205
TVec<StatSpec> statspecs(nstats);
00206
for (
int k=0;
k<nstats;
k++)
00207 statspecs[
k].init(
statnames[
k]);
00208
00209
00210
PP<VecStatsCollector> timewise_stats =
new VecStatsCollector();
00211
00212
00213
TVec<StatSpec> timewise_statspecs(timewise_nstats);
00214
for (
int k=0;
k<timewise_nstats; ++
k)
00215 timewise_statspecs[
k].init(
timewise_statnames[
k]);
00216
00217
VMat global_stats_vm;
00218
VMat split_stats_vm;
00219
VMat timewise_stats_vm;
00220
00221
00222
if (
report_stats)
00223 {
00224
saveStringInFile(
expdir+
"train_cost_names.txt",
join(traincostnames,
"\n")+
"\n");
00225
saveStringInFile(
expdir+
"test_cost_names.txt",
join(testcostnames,
"\n")+
"\n");
00226
00227 global_stats_vm =
new FileVMatrix(
expdir+
"global_stats.pmat", 0, nstats);
00228
for(
int k=0;
k<nstats;
k++)
00229 global_stats_vm->declareField(
k,statspecs[
k].statName());
00230 global_stats_vm->saveFieldInfos();
00231
00232 split_stats_vm =
new FileVMatrix(
expdir+
"sequence_stats.pmat", 0,
00233 1+nstats);
00234 split_stats_vm->declareField(0,
"splitnum");
00235
for(
int k=0;
k<nstats;
k++)
00236 split_stats_vm->declareField(
k+1,statspecs[
k].setname +
"." + statspecs[
k].intstatname);
00237 split_stats_vm->saveFieldInfos();
00238
00239
if (timewise_nstats > 0) {
00240 timewise_stats_vm =
new FileVMatrix(
expdir+
"timewise_stats.pmat", 0,
00241 timewise_nstats);
00242
for (
int k=0;
k<timewise_nstats; ++
k)
00243 timewise_stats_vm->declareField(
k, timewise_statspecs[
k].statName());
00244 timewise_stats_vm->saveFieldInfos();
00245 }
00246 }
00247
00248
00249
learner->build();
00250
00251
VMat test_outputs;
00252
VMat test_costs;
00253
if (
save_test_outputs)
00254 test_outputs =
new FileVMatrix(
expdir+
"/test_outputs.pmat",0,outputsize);
00255
if (
save_test_costs)
00256 test_costs =
new FileVMatrix(
expdir+
"/test_costs.pmat",0,testcostnames);
00257
00258
00259
int maxt = (
last_test_time >= 0? last_test_time :
dataset.
length() - 1);
00260
int splitnum = 0;
00261
double weight;
00262
Vec input, target;
00263
Vec output(
learner->outputsize());
00264
Vec costs(
learner->nTestCosts());
00265
00266
for (
int t=
init_train_size; t <= maxt; t++, splitnum++)
00267 {
00268
#ifdef DEBUG
00269
cout <<
"SequentialValidation::run() -- sub_train.length = " << t <<
" et sub_test.length = " << t+horizon <<
endl;
00270
#endif
00271
VMat sub_train =
dataset.
subMatRows(0,t);
00272
VMat sub_test =
dataset.
subMatRows(0, t+1);
00273
VMat only_test =
dataset.
subMatRows(t, 1);
00274
00275
string splitdir =
expdir+
"train_t="+
tostring(t)+
"/";
00276
if (
save_data_sets ||
save_initial_model ||
save_stat_collectors ||
save_final_model)
00277
force_mkdir(splitdir);
00278
if (
save_data_sets)
00279
PLearn::save(splitdir+
"training_set.psave", sub_train);
00280
if (
save_initial_model)
00281
PLearn::save(splitdir+
"initial_learner.psave",
learner);
00282
00283
00284 train_stats->forget();
00285
learner->setTrainingSet(sub_train,
false);
00286
learner->train();
00287 train_stats->finalize();
00288
00289
if (
save_stat_collectors)
00290
PLearn::save(splitdir+
"train_stats.psave",train_stats);
00291
if (
save_final_model)
00292
PLearn::save(splitdir+
"final_learner.psave",
learner);
00293
00294
00295
00296
dataset.
getExample(t, input, target, weight);
00297 test_stats->forget();
00298
learner->setTestSet(sub_test);
00299
learner->setCurrentTestTime(t);
00300
learner->computeOutputAndCosts(input, target, output, costs);
00301 test_stats->update(costs);
00302 test_stats->finalize();
00303
00304
00305
if (
save_data_sets)
00306
PLearn::save(splitdir+
"test_set.psave", sub_test);
00307
if (test_outputs)
00308 test_outputs->appendRow(output);
00309
if (test_costs)
00310 test_costs->appendRow(costs);
00311
if (
save_stat_collectors)
00312
PLearn::save(splitdir+
"test_stats.psave",test_stats);
00313
00314
Vec splitres(1+nstats);
00315 splitres[0] = splitnum;
00316
00317
00318
for(
int k=0;
k<nstats;
k++)
00319 {
00320
StatSpec& sp = statspecs[
k];
00321
if (sp.
setnum>=stcol.
length())
00322
PLERROR(
"SequentialValidation::run, trying to access a test set (test%d) beyond the last one (test%d)",
00323 sp.
setnum, stcol.
length()-1);
00324 splitres[
k+1] = stcol[sp.
setnum]->getStat(sp.
intstatname);
00325 }
00326
00327
if (split_stats_vm)
00328 split_stats_vm->appendRow(splitres);
00329
00330
00331 sequence_stats->update(splitres.
subVec(1,nstats));
00332
00333
00334
00335
00336
if (timewise_stats_vm) {
00337
Vec timewise_res(timewise_nstats);
00338
for (
int k=0;
k<timewise_nstats; ++
k) {
00339
StatSpec& sp = timewise_statspecs[
k];
00340
if (sp.
setnum>=stcol.
length())
00341
PLERROR(
"SequentialValidation::run, trying to access a test set "
00342
"(test%d) beyond the last one (test%d)",
00343 sp.
setnum, stcol.
length()-1);
00344 timewise_res[
k] = stcol[sp.
setnum]->getStat(sp.
intstatname);
00345 }
00346 timewise_stats->update(timewise_res);
00347
for (
int k=0;
k<timewise_nstats; ++
k)
00348 timewise_res[
k] =
00349 timewise_stats->getStats(
k).getStat(timewise_statspecs[
k].extstat);
00350 timewise_stats_vm->appendRow(timewise_res);
00351 }
00352 }
00353
00354 sequence_stats->finalize();
00355
00356
Vec global_result(nstats);
00357
for (
int k=0;
k<nstats;
k++)
00358 global_result[
k] = sequence_stats->getStats(
k).getStat(statspecs[
k].extstat);
00359
00360
if (global_stats_vm)
00361 global_stats_vm->appendRow(global_result);
00362
00363
reportStats(global_result);
00364 }
00365
00366 void SequentialValidation::setExperimentDirectory(
const string& _expdir)
00367 {
00368
expdir = _expdir;
00369
if(
provide_learner_expdir)
00370
learner->setExperimentDirectory(
append_slash(
expdir)+
"Model");
00371 }
00372
00373 void SequentialValidation::reportStats(
const Vec& global_result)
00374 {
00375
if (!
report_stats)
00376
return;
00377
00378
saveAscii(
expdir+
"global_result.avec", global_result);
00379
00380
00381 }
00382
00383 }
00384