00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00040
#include <plearn/math/VecStatsCollector.h>
00041
#include <plearn/vmat/FileVMatrix.h>
00042
#include "PTester.h"
00043
00044
namespace PLearn {
00045
using namespace std;
00046
00047 TVec<string>
addprepostfix(
const string& prefix,
const TVec<string>& names,
const string& postfix)
00048 {
00049 TVec<string> newnames(names.size());
00050 TVec<string>::const_iterator it = names.begin();
00051 TVec<string>::iterator newit = newnames.begin();
00052
while(it!=names.end())
00053 {
00054 *newit = prefix + *it + postfix;
00055 ++it;
00056 ++newit;
00057 }
00058
return newnames;
00059 }
00060
00061
// Prepends a single element to a vector: the result has size 1 + v.size(),
// holds `x` at index 0 and the elements of `v` (in order) after it.
// Neither argument is modified.
template<class T>
TVec<T> operator&(const T& x, const TVec<T>& v)
{
    const int tail_len = v.size();
    TVec<T> joined(tail_len + 1);

    joined[0] = x;
    // Copy the tail element by element, shifted one slot to the right.
    for (int idx = 0; idx < tail_len; ++idx)
        joined[idx + 1] = v[idx];

    return joined;
}
00069
00070 PTester::PTester()
00071 : provide_learner_expdir(false),
00072 report_stats(true),
00073 save_data_sets(false),
00074 save_initial_learners(false),
00075 save_initial_tester(true),
00076 save_learners(true),
00077 save_stat_collectors(true),
00078 save_test_costs(false),
00079 save_test_outputs(false),
00080 train(true)
00081 {}
00082
00083
// Registers PTester with the PLearn object system.
// Second argument: one-line description.  Third argument: multi-line help
// (adjacent string literals are concatenated by the compiler).
PLEARN_IMPLEMENT_OBJECT(
    PTester,
    "Manages a learning experiment, with training and estimation of generalization error.",
    "The PTester class allows you to describe a typical learning experiment that you wish to perform, \n"
    "as a training/testing of a learning algorithm on a particular dataset.\n"
    "The splitter is used to obtain one or several (such as for k-fold) splits of the dataset \n"
    "and training/testing is performed on each split. \n"
    "Requested statistics are computed, and all requested results are written in an appropriate \n"
    "file inside the specified experiment directory. \n"
    "Statistics can be either specified entirely from the 'statnames' option, or built from\n"
    "'statnames' and 'statmask'. For instance, one may set:\n"
    " statnames = [ \"NLL\" \"mse\" ]\n"
    " statmask = [ [ \"E[*]\" ] [ \"test1.*\" \"test2.*\" ] [ \"E[*]\" \"STDERROR[*]\" ] ]\n"
    "and this will compute:\n"
    " E[test1.E[NLL]], STDERROR[test1.E[NLL]], E[test2.E[NLL]], STDERROR[test2.E[NLL]]\n"
    " E[test1.E[mse]], STDERROR[test1.E[mse]], E[test2.E[mse]], STDERROR[test2.E[mse]]\n"
    );
00098
00099
00100
// Declares every build option of PTester (name, member, flag and help text)
// in `ol`, then lets the parent class append its own options.
//
// The order of the declareOption calls is kept as-is, since it is the order
// in which the options are added to `ol`.
//
// Fix: the help of 'statnames' used to describe the form
// S1[S2[dataset.cost_name]], which StatSpec::parseStatname reports as the
// *old* format (it emits a warning).  The help now describes the form
// S1[dataset.S2[cost_name]], which matches the example given in the class
// help (e.g. E[test1.E[NLL]]).
void PTester::declareOptions(OptionList& ol)
{
    // --- Where results go and what data is used -------------------------
    declareOption(ol, "expdir", &PTester::expdir, OptionBase::buildoption,
                  "Path of this tester's directory in which to save all tester results.\n"
                  "The directory will be created if it does not already exist.\n"
                  "If this is an empty string, no directory is created and no output file is generated.\n");

    declareOption(ol, "dataset", &PTester::dataset, OptionBase::buildoption,
                  "The dataset to use to generate splits. \n"
                  "(This is ignored if your splitter is an ExplicitSplitter)\n"
                  "Data-sets are seen as matrices whose columns or fields are layed out as \n"
                  "follows: a number of input fields, followed by (optional) target fields, \n"
                  "followed by a (optional) weight field (to weigh each example).\n"
                  "The sizes of those areas are given by the VMatrix options \n"
                  "inputsize targetsize, and weightsize, which are typically used by the \n"
                  "learner upon building\n");

    declareOption(ol, "splitter", &PTester::splitter, OptionBase::buildoption,
                  "The splitter to use to generate one or several train/test tuples from the dataset.");

    // --- Which statistics to compute ------------------------------------
    declareOption(ol, "statnames", &PTester::statnames, OptionBase::buildoption,
                  "A list of global statistics we are interested in.\n"
                  "These are strings of the form S1[dataset.S2[cost_name]] where:\n"
                  " - dataset is train or test1 or test2 ... (train being \n"
                  " the first dataset in a split, test1 the second, ...) \n"
                  " - cost_name is one of the training or test cost names (depending on dataset) understood \n"
                  " by the underlying learner (see its getTrainCostNames and getTestCostNames methods) \n"
                  " - S1 and S2 are a statistic, i.e. one of: E (expectation), V(variance), MIN, MAX, STDDEV, ... \n"
                  " S2 is computed over the samples of a given dataset split. S1 is over the splits. \n");

    declareOption(ol, "statmask", &PTester::statmask, OptionBase::buildoption,
                  "A list of lists of masks. If provided, each of the lists is used to compose the statnames_processed.\n"
                  "If not provided the statnames are those in the 'statnames' list. See the class help for an example.\n");

    declareOption(ol, "learner", &PTester::learner, OptionBase::buildoption,
                  "The learner to train/test.\n");

    // --- What gets written to disk --------------------------------------
    declareOption(ol, "report_stats", &PTester::report_stats, OptionBase::buildoption,
                  "If true, the computed global statistics specified in statnames will be saved in global_stats.pmat \n"
                  "and the corresponding per-split statistics will be saved in split_stats.pmat \n"
                  "For reference, all cost names (as given by the learner's getTrainCostNames() and getTestCostNames() ) \n"
                  "will be reported in files train_cost_names.txt and test_cost_names.txt");

    declareOption(ol, "save_initial_tester", &PTester::save_initial_tester, OptionBase::buildoption,
                  "If true, this PTester object will be saved in its initial state in tester.psave \n"
                  "Thus if the initial .plearn file gets lost, or modified, we can always see what this tester was.\n");

    declareOption(ol, "save_stat_collectors", &PTester::save_stat_collectors, OptionBase::buildoption,
                  "If true, stat collectors for split#k will be saved in Split#k/train_stats.psave and Split#k/test#i_stats.psave");

    declareOption(ol, "save_learners", &PTester::save_learners, OptionBase::buildoption,
                  "If true, the final trained learner for split#k will be saved in Split#k/final_learner.psave");

    declareOption(ol, "save_initial_learners", &PTester::save_initial_learners, OptionBase::buildoption,
                  "If true, the initial untrained learner for split#k (just after forget() has been called) will be saved in Split#k/initial_learner.psave");

    declareOption(ol, "save_data_sets", &PTester::save_data_sets, OptionBase::buildoption,
                  "If true, the data set generated for split #k will be saved as Split#k/training_set.psave Split#k/test1_set.psave ...");

    declareOption(ol, "save_test_outputs", &PTester::save_test_outputs, OptionBase::buildoption,
                  "If true, the outputs of the test for split #k will be saved in Split#k/test#i_outputs.pmat");

    declareOption(ol, "save_test_costs", &PTester::save_test_costs, OptionBase::buildoption,
                  "If true, the costs of the test for split #k will be saved in Split#k/test#i_costs.pmat");

    declareOption(ol, "provide_learner_expdir", &PTester::provide_learner_expdir, OptionBase::buildoption,
                  "If true, each learner to be trained will have its experiment directory set to Split#k/LearnerExpdir/");

    // --- Behaviour of the experiment ------------------------------------
    declareOption(ol, "train", &PTester::train, OptionBase::buildoption,
                  "If true, the learners are trained, otherwise only tested (in that case it is advised\n"
                  "to load an already trained learner in the 'learner' field)");

    declareOption(ol, "template_stats_collector", &PTester::template_stats_collector, OptionBase::buildoption,
                  "If provided, this instance of a subclass of VecStatsCollector will be used as a template\n"
                  "to build all the stats collector used during training and testing of the learner");

    declareOption(ol, "global_template_stats_collector", &PTester::global_template_stats_collector, OptionBase::buildoption,
                  "If provided, this instance of a subclass of VecStatsCollector will be used as a template\n"
                  "to build all the global stats collector that collects statistics over splits");

    declareOption(ol, "final_commands", &PTester::final_commands, OptionBase::buildoption,
                  "If provided, the shell commands given will be executed after training is completed");

    // Parent options are declared last.
    inherited::declareOptions(ol);
}
00166
00167
void PTester::build_()
00168 {
00169
if(expdir!=
"")
00170 {
00171
if(
pathexists(expdir))
00172
PLERROR(
"Directory (or file) %s already exists. First move it out of the way.",expdir.c_str());
00173
if(!
force_mkdir(expdir))
00174
PLERROR(
"In PTester Could not create experiment directory %s",expdir.c_str());
00175 expdir =
abspath(expdir);
00176 }
00177
00178 statnames_processed.resize(statnames.length());
00179 statnames_processed << statnames;
00180
if (statmask) {
00181 TVec< TVec<string> > temp(2);
00182
int d = 0;
00183 temp[d] = statnames_processed;
00184
for (
int i=0;i<statmask.length();i++) {
00185 temp[1-d].resize(temp[d].length() * statmask[i].length());
00186
00187
for (
int j=0;j<statmask[i].length();j++) {
00188
string mask = statmask[i][j];
00189 size_t pos;
00190
if ((pos=mask.find(
'*'))==string::npos) {
00191
00192
00193
for (
int k = 0;
k < temp[d].length();
k++) {
00194 temp[1-d][j +
k * statmask[i].length()] = mask;
00195 }
00196 }
else {
00197
for (
int k=0;
k<temp[d].length();
k++) {
00198
if (temp[d][
k].find(
'*')!=string::npos) {
00199
PLERROR(
"In PTester::build_ : elements of statnames cannot contain the '*' character");
00200 }
00201
string elem = mask;
00202 elem.replace(pos,1,temp[d][k]);
00203 temp[1-d][j +
k * statmask[i].length()] = elem;
00204 }
00205 }
00206 }
00207 d = 1-d;
00208 }
00209 statnames_processed = temp[d];
00210 }
00211 }
00212
00213
00214
void PTester::build()
00215 {
00216 inherited::build();
00217
build_();
00218 }
00219
00220
void PTester::run()
00221 {
00222 perform(
true);
00223 }
00224
00225
void PTester::setExperimentDirectory(
const string& the_expdir)
00226 {
00227
if(the_expdir==
"")
00228 expdir =
"";
00229
else
00230 {
00231
if(!
force_mkdir(the_expdir))
00232
PLERROR(
"In PTester::setExperimentDirectory Could not create experiment directory %s",the_expdir.c_str());
00233 expdir =
abspath(the_expdir);
00234 }
00235 }
00236
00237
// Runs the whole experiment: for every split given by the splitter, trains
// (if 'train' is true) and tests the learner, collects the per-split
// statistics named in 'statnames_processed', aggregates them over splits,
// and finally runs the shell commands listed in 'final_commands'.
//
//   call_forget : passed (and-ed with 'train') to learner->setTrainingSet().
//                 It is forced to true when there is more than one split.
//
// Returns a vector with one value per entry of 'statnames_processed', in
// the same order: the requested "outer" statistic computed over the splits.
//
// Errors (PLERROR): no learner or no splitter.
//
// Files are only written when 'expdir' is non-empty and the corresponding
// save_* / report_stats option is set.
//
// Changes with respect to the previous version:
//  - the empty-test-set warning used the format "% is" instead of "%s is",
//    which is not a valid conversion for the char* argument passed;
//  - the statistics / cost-name files were built with expdir+"name" while
//    the rest of the function uses append_slash(expdir)+"name"; all paths
//    now go through append_slash so they do not depend on 'expdir' ending
//    with a slash;
//  - force_mkdir(splitdir) is no longer called with an empty path;
//  - a non-zero status from a final command is reported with a warning
//    instead of being silently dropped.
Vec PTester::perform(bool call_forget)
{
    if (!learner)
        PLERROR("No learner specified for PTester.");
    if (!splitter)
        PLERROR("No splitter specified for PTester");

    const int nstats = statnames_processed.length();
    Vec global_result(nstats);

    // NOTE(review): the inner scope is kept from the original code; it makes
    // every local below (including the file-backed matrices) go out of scope
    // before the final commands run — presumably on purpose, confirm.
    {
        if (expdir != "")
        {
            // Keep a copy of the tester as it was before running anything.
            if (save_initial_tester)
                PLearn::save(append_slash(expdir)+"tester.psave", *this);
        }

        splitter->setDataSet(dataset);

        int nsplits = splitter->nsplits();
        if (nsplits > 1)
            call_forget = true;

        TVec<string> testcostnames = learner->getTestCostNames();
        TVec<string> traincostnames = learner->getTrainCostNames();

        int nsets = splitter->nSetsPerSplit();

        // One stats collector per set of a split: index 0 is the training
        // set (train cost names), the others are test sets (test cost names).
        TVec< PP<VecStatsCollector> > stcol(nsets);
        for (int setnum = 0; setnum < nsets; setnum++)
        {
            if (template_stats_collector)
            {
                CopiesMap copies;
                stcol[setnum] = template_stats_collector->deepCopy(copies);
            }
            else
                stcol[setnum] = new VecStatsCollector();

            if (setnum == 0)
                stcol[setnum]->setFieldNames(traincostnames);
            else
                stcol[setnum]->setFieldNames(testcostnames);

            stcol[setnum]->build();
            stcol[setnum]->forget();
        }

        PP<VecStatsCollector> train_stats = stcol[0];
        learner->setTrainStatsCollector(train_stats);

        // Collector that accumulates the per-split results over all splits.
        PP<VecStatsCollector> global_statscol;
        if (global_template_stats_collector)
        {
            CopiesMap copies;
            global_statscol = global_template_stats_collector->deepCopy(copies);
            global_statscol->build();
            global_statscol->forget();
        }
        else
            global_statscol = new VecStatsCollector();

        // Parse every requested statistic name once.
        TVec<StatSpec> statspecs(nstats);
        for (int k = 0; k < nstats; k++)
            statspecs[k].init(statnames_processed[k]);

        int testcostsize = testcostnames.size();

        VMat global_stats_vm;  // one row: the global statistics
        VMat split_stats_vm;   // one row per split: split number + statistics
        if (expdir != "" && report_stats)
        {
            const string report_dir = append_slash(expdir);

            saveStringInFile(report_dir+"train_cost_names.txt", join(traincostnames,"\n")+"\n");
            saveStringInFile(report_dir+"test_cost_names.txt", join(testcostnames,"\n")+"\n");

            global_stats_vm = new FileVMatrix(report_dir+"global_stats.pmat", 1, nstats);
            for (int k = 0; k < nstats; k++)
                global_stats_vm->declareField(k, statspecs[k].statName());
            global_stats_vm->saveFieldInfos();

            split_stats_vm = new FileVMatrix(report_dir+"split_stats.pmat", 0, 1+nstats);
            split_stats_vm->declareField(0, "splitnum");
            for (int k = 0; k < nstats; k++)
                split_stats_vm->declareField(k+1, statspecs[k].setname + "." + statspecs[k].intstatname);
            split_stats_vm->saveFieldInfos();
        }

        for (int splitnum = 0; splitnum < nsplits; splitnum++)
        {
            // Directory of this split; stays empty when nothing is to be saved.
            string splitdir;
            if (expdir != "")
                splitdir = append_slash(append_slash(expdir)+"Split"+tostring(splitnum));

            TVec<VMat> dsets = splitter->getSplit(splitnum);
            VMat trainset = dsets[0];
            if (splitdir != "" && save_data_sets)
                PLearn::save(splitdir+"training_set.psave", trainset);

            if (splitdir != "" && train && provide_learner_expdir)
                learner->setExperimentDirectory(append_slash(splitdir+"LearnerExpdir"));

            learner->setTrainingSet(trainset, call_forget && train);
            // The second set of the split, when present, is also handed to
            // the learner as validation set.
            if (dsets.size() > 1)
                learner->setValidationSet(dsets[1]);

            int outputsize = learner->outputsize();

            if (train)
            {
                if (splitdir != "" && save_initial_learners)
                    PLearn::save(splitdir+"initial_learner.psave", learner);

                train_stats->forget();
                learner->train();
                train_stats->finalize();

                if (splitdir != "" && save_stat_collectors)
                    PLearn::save(splitdir+"train_stats.psave", train_stats);
                if (splitdir != "" && save_learners)
                    PLearn::save(splitdir+"final_learner.psave", learner);
            }
            else
                learner->build();

            // Test on every remaining set of the split (test1, test2, ...).
            for (int setnum = 1; setnum < dsets.length(); setnum++)
            {
                VMat testset = dsets[setnum];
                PP<VecStatsCollector> test_stats = stcol[setnum];
                string setname = "test"+tostring(setnum);

                if (splitdir != "" && save_data_sets)
                    PLearn::save(splitdir+setname+"_set.psave", testset);

                VMat test_outputs;
                VMat test_costs;

                // Only create the split directory when there is one.
                if (splitdir != "")
                    force_mkdir(splitdir);

                if (splitdir != "" && save_test_outputs)
                    test_outputs = new FileVMatrix(splitdir+setname+"_outputs.pmat", 0, outputsize);
                if (splitdir != "" && save_test_costs)
                    test_costs = new FileVMatrix(splitdir+setname+"_costs.pmat", 0, testcostsize);

                test_stats->forget();
                if (testset->length() == 0)
                {
                    PLWARNING("PTester:: test set %s is of length 0, costs will be set to -1",setname.c_str());
                }
                learner->test(testset, test_stats, test_outputs, test_costs);
                test_stats->finalize();

                if (splitdir != "" && save_stat_collectors)
                    PLearn::save(splitdir+setname+"_stats.psave", test_stats);
            }

            // Row for this split: split number followed by one value per stat.
            Vec splitres(1+nstats);
            splitres[0] = splitnum;

            for (int k = 0; k < nstats; k++)
            {
                StatSpec& sp = statspecs[k];
                // A statistic that refers to a set this splitter does not
                // provide is reported as missing.
                if (sp.setnum >= stcol.length())
                    splitres[k+1] = MISSING_VALUE;
                else
                    splitres[k+1] = stcol[sp.setnum]->getStat(sp.intstatname);
            }

            if (split_stats_vm)
            {
                split_stats_vm->appendRow(splitres);
                split_stats_vm->flush();
            }

            // The split number itself is not a statistic: skip element 0.
            global_statscol->update(splitres.subVec(1,nstats));
        }

        global_statscol->finalize();
        for (int k = 0; k < nstats; k++)
            global_result[k] = global_statscol->getStats(k).getStat(statspecs[k].extstat);

        if (global_stats_vm)
            global_stats_vm->appendRow(global_result);
    }

    // Run the user-supplied shell commands, in order.
    // NOTE(review): the commands come from the 'final_commands' option and
    // are handed to the shell unchanged.
    for (int i = 0; i < final_commands.length(); i++)
    {
        const int status = system(final_commands[i].c_str());
        if (status != 0)
            PLWARNING("In PTester::perform - final command '%s' returned status %d",
                      final_commands[i].c_str(), status);
    }

    return global_result;
}
00433
00434
00436
00438 TVec<string> PTester::getStatNames()
00439 {
00440
return statnames_processed;
00441 }
00442
00443
00444
00445 void StatSpec::init(
const string& statname)
00446 {
00447
parseStatname(statname);
00448 }
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
void StatSpec::parseStatname(
const string& statname)
00500 {
00501
PIStringStream in(
removeblanks(statname));
00502
if(in.
smartReadUntilNext(
"[", extstat)==EOF)
00503
PLERROR(
"No opening bracket found in statname %s", statname.c_str());
00504
string token;
00505
int nextsep = in.
smartReadUntilNext(
".[",token);
00506
if(nextsep==EOF)
00507
PLERROR(
"Expected dataset.xxxSTATxxx after the opening bracket. Got %s", token.c_str());
00508
else if(nextsep==
'[')
00509 {
00510
PLWARNING(
"In StatSpec::parseStatname - You are still using the old statnames format, please use the new one!");
00511
00512 intstatname = token;
00513
if(in.
smartReadUntilNext(
".",setname)==EOF)
00514
PLERROR(
"Error while parsing statname: expected a dot");
00515
string costname;
00516
if(in.
smartReadUntilNext(
"]",costname)==EOF)
00517
PLERROR(
"Error while parsing statname: expected a closing bracket");
00518 intstatname = intstatname+
"["+costname+
"]";
00519 }
00520
else
00521 {
00522 setname = token;
00523
if(in.
smartReadUntilNext(
"]",intstatname)==EOF)
00524
PLERROR(
"Error while parsing statname: expected a closing bracket");
00525 }
00526
00527
if(
setname==
"train")
00528 setnum = 0;
00529
else if(
setname==
"test")
00530 setnum = 1;
00531
else if(
setname.substr(0,4)==
"test")
00532 {
00533 setnum =
toint(
setname.substr(4));
00534
if(setnum==0)
00535
PLERROR(
"In parseStatname: use the name train instead of test0.\n"
00536
"The first set of a split is the training set. The following are test sets named test1 test2 ...");
00537
if(setnum<=0)
00538
PLERROR(
"In parseStatname: parse error for %s",statname.c_str());
00539 }
00540
else
00541
PLERROR(
"In parseStatname: parse error for %s",statname.c_str());
00542 }
00543
00545
00547 void PTester::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies) {
00548 inherited::makeDeepCopyFromShallowCopy(copies);
00549
deepCopyField(statnames_processed, copies);
00550
deepCopyField(
dataset, copies);
00551
deepCopyField(
final_commands, copies);
00552
deepCopyField(
global_template_stats_collector, copies);
00553
deepCopyField(
learner, copies);
00554
deepCopyField(
splitter, copies);
00555
deepCopyField(
statmask, copies);
00556
deepCopyField(
template_stats_collector, copies);
00557
deepCopyField(statnames, copies);
00558
00559
00560 }
00561
00562 }