00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00040
#include <plearn_learners/testers/PTester.h>
00041
#include <plearn/io/pl_io.h>
00042
#include <plearn/math/VecStatsCollector.h>
00043
#include <plearn/vmat/AsciiVMatrix.h>
00044
#include <plearn/vmat/FileVMatrix.h>
00045
00046
namespace PLearn {
00047
using namespace std;
00048
00049 TVec<string> addprepostfix(
const string& prefix,
const TVec<string>& names,
const string& postfix)
00050 {
00051
TVec<string> newnames(names.
size());
00052
TVec<string>::const_iterator it = names.
begin();
00053
TVec<string>::iterator newit = newnames.
begin();
00054
while(it!=names.
end())
00055 {
00056 *newit = prefix + *it + postfix;
00057 ++it;
00058 ++newit;
00059 }
00060
return newnames;
00061 }
00062
00063 template<
class T>
TVec<T> operator&(
const T& x,
const TVec<T>& v)
00064 {
00065
int l = v.
size();
00066
TVec<T> res(1+l);
00067 res[0] =
x;
00068 res.
subVec(1,l) << v;
00069
return res;
00070 }
00071
00072 PTester::PTester()
00073 : report_stats(true),
00074 save_initial_experiment(true),
00075 save_stat_collectors(true),
00076 save_learners(true),
00077 save_initial_learners(false),
00078 save_data_sets(false),
00079 save_test_outputs(false),
00080 save_test_costs(false),
00081 provide_learner_expdir(false)
00082 {}
00083
00084
// Object registration for PTester: class name, one-line summary, and the
// multi-line help text describing the class.
PLEARN_IMPLEMENT_OBJECT(
    PTester,
    "Evaluates the performance of a PLearner",
    "The PTester class allows you to describe a typical learning experiment that you wish to perform, \n"
    "as a training/testing of a learning algorithm on a particular dataset.\n"
    "The splitter is used to obtain one or several (such as for k-fold) splits of the dataset \n"
    "and training/testing is performed on each split. \n"
    "Requested statistics are computed, and all requested results are written in an appropriate \n"
    "file inside the specified experiment directory. \n");
00091
00092 void PTester::declareOptions(
OptionList& ol)
00093 {
00094
declareOption(ol,
"expdir", &PTester::expdir, OptionBase::buildoption,
00095
"Path of this experiment's directory in which to save all experiment results.\n"
00096
"The directory will be created if it does not already exist.\n"
00097
"If this is an empty string, no directory is created and no output file is generated.\n");
00098
declareOption(ol,
"learner", &PTester::learner, OptionBase::buildoption,
00099
"The learner to train/test.\n"
00100
"learner.train_set will be used as the dataset for this experiment\n"
00101
"(you may omit learner.train_set if your splitter is an ExplicitSplitter)");
00102
declareOption(ol,
"splitter", &PTester::splitter, OptionBase::buildoption,
00103
"The splitter to use to generate one or several train/test tuples from the dataset.");
00104
declareOption(ol,
"statnames", &PTester::statnames, OptionBase::buildoption,
00105
"A list of global statistics we are interested in.\n"
00106
"These are strings of the form S1[S2[dataset.cost_name]] where:\n"
00107
" - dataset is train or test1 or test2 ... (train being \n"
00108
" the first dataset in a split, test1 the second, ...) \n"
00109
" - cost_name is one of the training or test cost names (depending on dataset) understood \n"
00110
" by the underlying learner (see its getTrainCostNames and getTestCostNames methods) \n"
00111
" - S1 and S2 are a statistic, i.e. one of: E (expectation), V(variance), MIN, MAX, STDDEV, ... \n"
00112
" S2 is computed over the samples of a given dataset split. S1 is over the splits. \n");
00113
declareOption(ol,
"report_stats", &PTester::report_stats, OptionBase::buildoption,
00114
"If true, the computed global statistics specified in statnames will be saved in global_stats.pmat \n"
00115
"and the corresponding per-split statistics will be saved in split_stats.pmat \n"
00116
"For reference, all cost names (as given by the learner's getTrainCostNames() and getTestCostNames() ) \n"
00117
"will be reported in files train_cost_names.txt and test_cost_names.txt");
00118
declareOption(ol,
"save_initial_experiment", &PTester::save_initial_experiment, OptionBase::buildoption,
00119
"If true, this PTester object will be saved in its initial state in experiment.psave \n"
00120
"Thus if the initial .plearn file gets lost, or modified, we can always see what this experiment was.\n");
00121
declareOption(ol,
"save_stat_collectors", &PTester::save_stat_collectors, OptionBase::buildoption,
00122
"If true, stat collectors for split#k will be saved in Split#k/train_stats.psave and Split#k/test#i_stats.psave");
00123
declareOption(ol,
"save_learners", &PTester::save_learners, OptionBase::buildoption,
00124
"If true, the final trained learner for split#k will be saved in Split#k/final_learner.psave");
00125
declareOption(ol,
"save_initial_learners", &PTester::save_initial_learners, OptionBase::buildoption,
00126
"If true, the initial untrained learner for split#k (just after forget() has been called) will be saved in Split#k/initial_learner.psave");
00127
declareOption(ol,
"save_data_sets", &PTester::save_data_sets, OptionBase::buildoption,
00128
"If true, the data set generated for split #k will be saved as Split#k/training_set.psave Split#k/test1_set.psave ...");
00129
declareOption(ol,
"save_test_outputs", &PTester::save_test_outputs, OptionBase::buildoption,
00130
"If true, the outputs of the test for split #k will be saved in Split#k/test#i_outputs.pmat");
00131
declareOption(ol,
"save_test_costs", &PTester::save_test_costs, OptionBase::buildoption,
00132
"If true, the costs of the test for split #k will be saved in Split#k/test#i_costs.pmat");
00133
declareOption(ol,
"provide_learner_expdir", &PTester::provide_learner_expdir, OptionBase::buildoption,
00134
"If true, each learner to be trained will have its experiment directory set to Split#k/LearnerExpdir/");
00135 inherited::declareOptions(ol);
00136 }
00137
00138 void PTester::build_()
00139 {
00140
if(
expdir!=
"")
00141 {
00142
if(
pathexists(
expdir))
00143
PLERROR(
"Directory (or file) %s already exists. First move it out of the way.",
expdir.c_str());
00144
if(!
force_mkdir(
expdir))
00145
PLERROR(
"In PTester Could not create experiment directory %s",
expdir.c_str());
00146
expdir =
abspath(
expdir);
00147 }
00148 }
00149
00150
00151 void PTester::build()
00152 {
00153 inherited::build();
00154
build_();
00155 }
00156
00157 void PTester::run()
00158 {
00159
perform(
false);
00160 }
00161
00162
00164
00165 class StatSpec
00166 {
00167
public:
00168
string extstat;
00169 string intstat;
00170
string setname;
00171
int setnum;
00172 string costname;
00173 int costindex;
00174
00175 StatSpec()
00176 :
setnum(-1),
costindex(-1)
00177 {}
00178
00179
void init(
const string& statname,
PP<PLearner> learner);
00180
00181 string intStatName()
00182 {
return intstat +
"[" +
setname +
"." +
costname +
"]"; }
00183
00184
00185 string statName()
00186 {
return extstat +
"[" +
intStatName() +
"]"; }
00187
00188
00189
private:
00190
00192
void parseStatname(
const string& statname);
00193
00194 };
00195
00196
00197 void StatSpec::init(
const string& statname,
PP<PLearner> learner)
00198 {
00199
parseStatname(statname);
00200
if(
setnum==0)
00201
costindex = learner->getTrainCostIndex(
costname);
00202
else
00203
costindex = learner->getTestCostIndex(
costname);
00204 }
00205
00206 void StatSpec::parseStatname(
const string& statname)
00207 {
00208
vector<string> tokens =
split(
removeallblanks(statname),
"[]");
00209
string set_and_cost;
00210
00211
if(tokens.size()==2)
00212 {
00213
extstat =
"E";
00214
intstat = tokens[0];
00215 set_and_cost = tokens[1];
00216 }
00217
else if(tokens.size()==3)
00218 {
00219
extstat = tokens[0];
00220
intstat = tokens[1];
00221 set_and_cost = tokens[2];
00222 }
00223
else
00224
PLERROR(
"In parse_statname: parse error for %s",statname.c_str());
00225
00226
if(set_and_cost.length()<5)
00227
PLERROR(
"In parse_statname: parse error for %s",statname.c_str());
00228
00229
split_on_first(set_and_cost,
".",
setname,
costname);
00230
00231
if(
setname==
"train")
00232
setnum = 0;
00233
else if(
setname==
"test")
00234
setnum = 1;
00235
else if(
setname.substr(0,4)==
"test")
00236 {
00237
setnum =
toint(
setname.substr(4));
00238
if(
setnum==0)
00239
PLERROR(
"In parse_statname: use the name train instead of test0.\n"
00240
"The first set of a split is the training set. The following are test sets named test1 test2 ...");
00241
if(
setnum<=0)
00242
PLERROR(
"In parse_statname: parse error for %s",statname.c_str());
00243 }
00244
else
00245
PLERROR(
"In parse_statname: parse error for %s",statname.c_str());
00246 }
00247
00248
00249 void PTester::setExperimentDirectory(
const string& the_expdir)
00250 {
00251
if(the_expdir==
"")
00252
expdir =
"";
00253
else
00254 {
00255
if(!
force_mkdir(the_expdir))
00256
PLERROR(
"In PTester::setExperimentDirectory Could not create experiment directory %s",the_expdir.c_str());
00257
expdir =
abspath(the_expdir);
00258 }
00259 }
00260
00261 Vec PTester::perform(
bool dont_set_training_set)
00262 {
00263
if(!
learner)
00264
PLERROR(
"No learner specified for PTester.");
00265
if(!
splitter)
00266
PLERROR(
"No splitter specified for PTester");
00267
00268
00269
VMat dataset =
learner->getTrainingSet();
00270
00271
if(
expdir!=
"")
00272 {
00273
00274
if(
save_initial_experiment)
00275
PLearn::save(
append_slash(
expdir)+
"experiment.psave", *
this, OptionBase::buildoption);
00276 }
00277
00278
splitter->setDataSet(dataset);
00279
00280
int nsplits =
splitter->nsplits();
00281
TVec<string> testcostnames =
learner->getTestCostNames();
00282
TVec<string> traincostnames =
learner->getTrainCostNames();
00283
00284
int nsets =
splitter->nSetsPerSplit();
00285
int nstats =
statnames.
length();
00286
00287
00288
TVec< PP<VecStatsCollector> > stcol(nsets);
00289
for(
int setnum=0; setnum<nsets; setnum++)
00290 stcol[setnum] =
new VecStatsCollector();
00291
PP<VecStatsCollector> train_stats = stcol[0];
00292
learner->setTrainStatsCollector(train_stats);
00293
00294
00295
PP<VecStatsCollector> global_statscol =
new VecStatsCollector();
00296
00297
00298
TVec<StatSpec> statspecs(nstats);
00299
for(
int k=0;
k<nstats;
k++)
00300 statspecs[
k].init(
statnames[
k],
learner);
00301
00302
00303
int testcostsize = testcostnames.
size();
00304
int outputsize =
learner->outputsize();
00305
00306
VMat global_stats_vm;
00307
VMat split_stats_vm;
00308
if(
expdir!=
"" &&
report_stats)
00309 {
00310
saveStringInFile(
expdir+
slash+
"train_cost_names.txt",
join(traincostnames,
"\n")+
"\n");
00311
saveStringInFile(
expdir+
slash+
"test_cost_names.txt",
join(testcostnames,
"\n")+
"\n");
00312
00313 global_stats_vm =
new FileVMatrix(
expdir+
slash+
"global_stats.pmat", 1, nstats);
00314
for(
int k=0;
k<nstats;
k++)
00315 global_stats_vm->declareField(
k,statspecs[
k].statName());
00316 global_stats_vm->saveFieldInfos();
00317
00318 split_stats_vm =
new FileVMatrix(
expdir+
slash+
"split_stats.pmat", nsplits, 1+nstats);
00319 split_stats_vm->declareField(0,
"splitnum");
00320
for(
int k=0;
k<nstats;
k++)
00321 split_stats_vm->declareField(
k+1,statspecs[
k].intStatName());
00322 split_stats_vm->saveFieldInfos();
00323 }
00324
00325
for(
int splitnum=0; splitnum<nsplits; splitnum++)
00326 {
00327
string splitdir;
00328
if(
expdir!=
"")
00329 splitdir =
append_slash(
expdir)+
"Split"+
tostring(splitnum)+
slash;
00330
00331
TVec<VMat> dsets =
splitter->getSplit(splitnum);
00332
VMat trainset = dsets[0];
00333
if(splitdir!=
"" &&
save_data_sets)
00334
PLearn::save(splitdir+
"training_set.psave",trainset);
00335
00336
if(splitdir!=
"" &&
provide_learner_expdir)
00337
learner->setExperimentDirectory(splitdir+
"LearnerExpdir"+
slash);
00338
00339
if(!dont_set_training_set || nsplits>1)
00340
learner->setTrainingSet(trainset);
00341
00342
if(splitdir!=
"" &&
save_initial_learners)
00343
PLearn::save(splitdir+
"initial_learner.psave",
learner);
00344
00345 train_stats->forget();
00346
learner->train();
00347 train_stats->finalize();
00348
if(
save_stat_collectors)
00349
PLearn::save(splitdir+
"train_stats.psave",train_stats);
00350
if(
save_learners)
00351
PLearn::save(splitdir+
"final_learner.psave",
learner);
00352
00353
for(
int setnum=1; setnum<dsets.
length(); setnum++)
00354 {
00355
VMat testset = dsets[setnum];
00356
PP<VecStatsCollector> test_stats = stcol[setnum];
00357
string setname =
"test"+
tostring(setnum);
00358
if(splitdir!=
"" &&
save_data_sets)
00359
PLearn::save(splitdir+setname+
"_set.psave",testset);
00360
VMat test_outputs;
00361
VMat test_costs;
00362
if(
save_test_outputs)
00363 test_outputs =
new FileVMatrix(splitdir+setname+
"_outputs.pmat",0,outputsize);
00364
if(
save_test_costs)
00365 test_costs =
new FileVMatrix(splitdir+setname+
"_costs.pmat",0,testcostsize);
00366
00367 test_stats->forget();
00368
learner->test(testset, test_stats, test_outputs, test_costs);
00369 test_stats->finalize();
00370
if(
save_stat_collectors)
00371
PLearn::save(splitdir+setname+
"_stats.psave",test_stats);
00372 }
00373
00374
Vec splitres(1+nstats);
00375 splitres[0] = splitnum;
00376
00377
for(
int k=0;
k<nstats;
k++)
00378 {
00379
StatSpec& sp = statspecs[
k];
00380 splitres[
k+1] = stcol[sp.
setnum]->getStats(sp.
costindex).getStat(sp.
intstat);
00381 }
00382
00383
if(split_stats_vm)
00384 split_stats_vm->appendRow(splitres);
00385
00386 global_statscol->update(splitres.
subVec(1,nstats));
00387 }
00388
00389
00390
Vec global_result(nstats);
00391
00392 global_statscol->finalize();
00393
for(
int k=0;
k<nstats;
k++)
00394 global_result[
k] = global_statscol->getStats(
k).getStat(statspecs[
k].extstat);
00395
00396
if(global_stats_vm)
00397 global_stats_vm->appendRow(global_result);
00398
00399
return global_result;
00400 }
00401
00402 }