Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

PExperiment.cc

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // PTester.cc 00004 // 00005 // Copyright (C) 2002 Pascal Vincent, Frederic Morin 00006 // 00007 // Redistribution and use in source and binary forms, with or without 00008 // modification, are permitted provided that the following conditions are met: 00009 // 00010 // 1. Redistributions of source code must retain the above copyright 00011 // notice, this list of conditions and the following disclaimer. 00012 // 00013 // 2. Redistributions in binary form must reproduce the above copyright 00014 // notice, this list of conditions and the following disclaimer in the 00015 // documentation and/or other materials provided with the distribution. 00016 // 00017 // 3. The name of the authors may not be used to endorse or promote 00018 // products derived from this software without specific prior written 00019 // permission. 00020 // 00021 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00022 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00023 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00024 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00025 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00026 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00027 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00028 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00029 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00030 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00031 // 00032 // This file is part of the PLearn library. For more information on the PLearn 00033 // library, go to the PLearn Web site at www.plearn.org 00034 00035 /* ******************************************************* 00036 * $Id: PExperiment.cc,v 1.11 2004/07/21 16:30:57 chrish42 Exp $ 00037 ******************************************************* */ 00038 00040 #include <plearn_learners/testers/PTester.h> 00041 #include <plearn/io/pl_io.h> 00042 #include <plearn/math/VecStatsCollector.h> 00043 #include <plearn/vmat/AsciiVMatrix.h> 00044 #include <plearn/vmat/FileVMatrix.h> 00045 00046 namespace PLearn { 00047 using namespace std; 00048 00049 TVec<string> addprepostfix(const string& prefix, const TVec<string>& names, const string& postfix) 00050 { 00051 TVec<string> newnames(names.size()); 00052 TVec<string>::const_iterator it = names.begin(); 00053 TVec<string>::iterator newit = newnames.begin(); 00054 while(it!=names.end()) 00055 { 00056 *newit = prefix + *it + postfix; 00057 ++it; 00058 ++newit; 00059 } 00060 return newnames; 00061 } 00062 00063 template<class T> TVec<T> operator&(const T& x, const TVec<T>& v) 00064 { 00065 int l = v.size(); 00066 TVec<T> res(1+l); 00067 res[0] = x; 00068 res.subVec(1,l) << v; 00069 return res; 00070 } 00071 00072 PTester::PTester() 00073 : report_stats(true), 00074 save_initial_experiment(true), 00075 save_stat_collectors(true), 00076 save_learners(true), 00077 save_initial_learners(false), 00078 save_data_sets(false), 00079 save_test_outputs(false), 00080 save_test_costs(false), 00081 provide_learner_expdir(false) 00082 {} 00083 00084 PLEARN_IMPLEMENT_OBJECT(PTester, "Evaluates the performance of a PLearner", 00085 "The PTester class allows you to describe a typical learning experiment that you wish to perform, \n" 00086 "as a training/testing of a learning algorithm on a particular dataset.\n" 00087 "The splitter is used to obtain one or several (such as for k-fold) splits of the dataset \n" 00088 "and training/testing is performed on each split. \n" 00089 "Requested statistics are computed, and all requested results are written in an appropriate \n" 00090 "file inside the specified experiment directory. \n"); 00091 00092 void PTester::declareOptions(OptionList& ol) 00093 { 00094 declareOption(ol, "expdir", &PTester::expdir, OptionBase::buildoption, 00095 "Path of this experiment's directory in which to save all experiment results.\n" 00096 "The directory will be created if it does not already exist.\n" 00097 "If this is an empty string, no directory is created and no output file is generated.\n"); 00098 declareOption(ol, "learner", &PTester::learner, OptionBase::buildoption, 00099 "The learner to train/test.\n" 00100 "learner.train_set will be used as the dataset for this experiment\n" 00101 "(you may omit learner.train_set if your splitter is an ExplicitSplitter)"); 00102 declareOption(ol, "splitter", &PTester::splitter, OptionBase::buildoption, 00103 "The splitter to use to generate one or several train/test tuples from the dataset."); 00104 declareOption(ol, "statnames", &PTester::statnames, OptionBase::buildoption, 00105 "A list of global statistics we are interested in.\n" 00106 "These are strings of the form S1[S2[dataset.cost_name]] where:\n" 00107 " - dataset is train or test1 or test2 ... (train being \n" 00108 " the first dataset in a split, test1 the second, ...) \n" 00109 " - cost_name is one of the training or test cost names (depending on dataset) understood \n" 00110 " by the underlying learner (see its getTrainCostNames and getTestCostNames methods) \n" 00111 " - S1 and S2 are a statistic, i.e. one of: E (expectation), V(variance), MIN, MAX, STDDEV, ... \n" 00112 " S2 is computed over the samples of a given dataset split. S1 is over the splits. \n"); 00113 declareOption(ol, "report_stats", &PTester::report_stats, OptionBase::buildoption, 00114 "If true, the computed global statistics specified in statnames will be saved in global_stats.pmat \n" 00115 "and the corresponding per-split statistics will be saved in split_stats.pmat \n" 00116 "For reference, all cost names (as given by the learner's getTrainCostNames() and getTestCostNames() ) \n" 00117 "will be reported in files train_cost_names.txt and test_cost_names.txt"); 00118 declareOption(ol, "save_initial_experiment", &PTester::save_initial_experiment, OptionBase::buildoption, 00119 "If true, this PTester object will be saved in its initial state in experiment.psave \n" 00120 "Thus if the initial .plearn file gets lost, or modified, we can always see what this experiment was.\n"); 00121 declareOption(ol, "save_stat_collectors", &PTester::save_stat_collectors, OptionBase::buildoption, 00122 "If true, stat collectors for split#k will be saved in Split#k/train_stats.psave and Split#k/test#i_stats.psave"); 00123 declareOption(ol, "save_learners", &PTester::save_learners, OptionBase::buildoption, 00124 "If true, the final trained learner for split#k will be saved in Split#k/final_learner.psave"); 00125 declareOption(ol, "save_initial_learners", &PTester::save_initial_learners, OptionBase::buildoption, 00126 "If true, the initial untrained learner for split#k (just after forget() has been called) will be saved in Split#k/initial_learner.psave"); 00127 declareOption(ol, "save_data_sets", &PTester::save_data_sets, OptionBase::buildoption, 00128 "If true, the data set generated for split #k will be saved as Split#k/training_set.psave Split#k/test1_set.psave ..."); 00129 declareOption(ol, "save_test_outputs", &PTester::save_test_outputs, OptionBase::buildoption, 00130 "If true, the outputs of the test for split #k will be saved in Split#k/test#i_outputs.pmat"); 00131 declareOption(ol, "save_test_costs", &PTester::save_test_costs, OptionBase::buildoption, 00132 "If true, the costs of the test for split #k will be saved in Split#k/test#i_costs.pmat"); 00133 declareOption(ol, "provide_learner_expdir", &PTester::provide_learner_expdir, OptionBase::buildoption, 00134 "If true, each learner to be trained will have its experiment directory set to Split#k/LearnerExpdir/"); 00135 inherited::declareOptions(ol); 00136 } 00137 00138 void PTester::build_() 00139 { 00140 if(expdir!="") 00141 { 00142 if(pathexists(expdir)) 00143 PLERROR("Directory (or file) %s already exists. First move it out of the way.",expdir.c_str()); 00144 if(!force_mkdir(expdir)) 00145 PLERROR("In PTester Could not create experiment directory %s",expdir.c_str()); 00146 expdir = abspath(expdir); 00147 } 00148 } 00149 00150 // ### Nothing to add here, simply calls build_ 00151 void PTester::build() 00152 { 00153 inherited::build(); 00154 build_(); 00155 } 00156 00157 void PTester::run() 00158 { 00159 perform(false); 00160 } 00161 00162 00164 00165 class StatSpec 00166 { 00167 public: 00168 string extstat; 00169 string intstat; 00170 string setname; 00171 int setnum; 00172 string costname; 00173 int costindex; // index of cost in vector of train costs (if setnum==0) or test costs (if setnum==1) computed by the learner. 00174 00175 StatSpec() 00176 : setnum(-1), costindex(-1) 00177 {} 00178 00179 void init(const string& statname, PP<PLearner> learner); 00180 00181 string intStatName() 00182 { return intstat + "[" + setname + "." + costname + "]"; } 00183 00184 00185 string statName() 00186 { return extstat + "[" + intStatName() + "]"; } 00187 00188 00189 private: 00190 00192 void parseStatname(const string& statname); 00193 00194 }; 00195 00196 00197 void StatSpec::init(const string& statname, PP<PLearner> learner) 00198 { 00199 parseStatname(statname); 00200 if(setnum==0) 00201 costindex = learner->getTrainCostIndex(costname); 00202 else 00203 costindex = learner->getTestCostIndex(costname); 00204 } 00205 00206 void StatSpec::parseStatname(const string& statname) 00207 { 00208 vector<string> tokens = split(removeallblanks(statname), "[]"); 00209 string set_and_cost; 00210 00211 if(tokens.size()==2) 00212 { 00213 extstat = "E"; 00214 intstat = tokens[0]; 00215 set_and_cost = tokens[1]; 00216 } 00217 else if(tokens.size()==3) 00218 { 00219 extstat = tokens[0]; 00220 intstat = tokens[1]; 00221 set_and_cost = tokens[2]; 00222 } 00223 else 00224 PLERROR("In parse_statname: parse error for %s",statname.c_str()); 00225 00226 if(set_and_cost.length()<5) 00227 PLERROR("In parse_statname: parse error for %s",statname.c_str()); 00228 00229 split_on_first(set_and_cost,".", setname, costname); 00230 00231 if(setname=="train") 00232 setnum = 0; 00233 else if(setname=="test") 00234 setnum = 1; 00235 else if(setname.substr(0,4)=="test") 00236 { 00237 setnum = toint(setname.substr(4)); 00238 if(setnum==0) 00239 PLERROR("In parse_statname: use the name train instead of test0.\n" 00240 "The first set of a split is the training set. The following are test sets named test1 test2 ..."); 00241 if(setnum<=0) 00242 PLERROR("In parse_statname: parse error for %s",statname.c_str()); 00243 } 00244 else 00245 PLERROR("In parse_statname: parse error for %s",statname.c_str()); 00246 } 00247 00248 00249 void PTester::setExperimentDirectory(const string& the_expdir) 00250 { 00251 if(the_expdir=="") 00252 expdir = ""; 00253 else 00254 { 00255 if(!force_mkdir(the_expdir)) 00256 PLERROR("In PTester::setExperimentDirectory Could not create experiment directory %s",the_expdir.c_str()); 00257 expdir = abspath(the_expdir); 00258 } 00259 } 00260 00261 Vec PTester::perform(bool dont_set_training_set) 00262 { 00263 if(!learner) 00264 PLERROR("No learner specified for PTester."); 00265 if(!splitter) 00266 PLERROR("No splitter specified for PTester"); 00267 00268 // get initial data set. 00269 VMat dataset = learner->getTrainingSet(); 00270 00271 if(expdir!="") 00272 { 00273 // Save this experiment description in the expdir (buildoptions only) 00274 if(save_initial_experiment) 00275 PLearn::save(append_slash(expdir)+"experiment.psave", *this, OptionBase::buildoption); 00276 } 00277 00278 splitter->setDataSet(dataset); 00279 00280 int nsplits = splitter->nsplits(); 00281 TVec<string> testcostnames = learner->getTestCostNames(); 00282 TVec<string> traincostnames = learner->getTrainCostNames(); 00283 00284 int nsets = splitter->nSetsPerSplit(); 00285 int nstats = statnames.length(); 00286 00287 // Stats collectors for individual sets of a split: 00288 TVec< PP<VecStatsCollector> > stcol(nsets); 00289 for(int setnum=0; setnum<nsets; setnum++) 00290 stcol[setnum] = new VecStatsCollector(); 00291 PP<VecStatsCollector> train_stats = stcol[0]; 00292 learner->setTrainStatsCollector(train_stats); 00293 00294 // Global stats collector 00295 PP<VecStatsCollector> global_statscol = new VecStatsCollector(); 00296 00297 // Stat specs 00298 TVec<StatSpec> statspecs(nstats); 00299 for(int k=0; k<nstats; k++) 00300 statspecs[k].init(statnames[k],learner); 00301 00302 // int traincostsize = traincostnames.size(); 00303 int testcostsize = testcostnames.size(); 00304 int outputsize = learner->outputsize(); 00305 00306 VMat global_stats_vm; // the vmat in which to save global result stats specified in statnames 00307 VMat split_stats_vm; // the vmat in which to save per split result stats 00308 if(expdir!="" && report_stats) 00309 { 00310 saveStringInFile(expdir+slash+"train_cost_names.txt", join(traincostnames,"\n")+"\n"); 00311 saveStringInFile(expdir+slash+"test_cost_names.txt", join(testcostnames,"\n")+"\n"); 00312 00313 global_stats_vm = new FileVMatrix(expdir+slash+"global_stats.pmat", 1, nstats); 00314 for(int k=0; k<nstats; k++) 00315 global_stats_vm->declareField(k,statspecs[k].statName()); 00316 global_stats_vm->saveFieldInfos(); 00317 00318 split_stats_vm = new FileVMatrix(expdir+slash+"split_stats.pmat", nsplits, 1+nstats); 00319 split_stats_vm->declareField(0,"splitnum"); 00320 for(int k=0; k<nstats; k++) 00321 split_stats_vm->declareField(k+1,statspecs[k].intStatName()); 00322 split_stats_vm->saveFieldInfos(); 00323 } 00324 00325 for(int splitnum=0; splitnum<nsplits; splitnum++) 00326 { 00327 string splitdir; 00328 if(expdir!="") 00329 splitdir = append_slash(expdir)+"Split"+tostring(splitnum)+slash; 00330 00331 TVec<VMat> dsets = splitter->getSplit(splitnum); 00332 VMat trainset = dsets[0]; 00333 if(splitdir!="" && save_data_sets) 00334 PLearn::save(splitdir+"training_set.psave",trainset); 00335 00336 if(splitdir!="" && provide_learner_expdir) 00337 learner->setExperimentDirectory(splitdir+"LearnerExpdir"+slash); 00338 00339 if(!dont_set_training_set || nsplits>1) 00340 learner->setTrainingSet(trainset); // also calls forget... 00341 00342 if(splitdir!="" && save_initial_learners) 00343 PLearn::save(splitdir+"initial_learner.psave",learner); 00344 00345 train_stats->forget(); 00346 learner->train(); 00347 train_stats->finalize(); 00348 if(save_stat_collectors) 00349 PLearn::save(splitdir+"train_stats.psave",train_stats); 00350 if(save_learners) 00351 PLearn::save(splitdir+"final_learner.psave",learner); 00352 00353 for(int setnum=1; setnum<dsets.length(); setnum++) 00354 { 00355 VMat testset = dsets[setnum]; 00356 PP<VecStatsCollector> test_stats = stcol[setnum]; 00357 string setname = "test"+tostring(setnum); 00358 if(splitdir!="" && save_data_sets) 00359 PLearn::save(splitdir+setname+"_set.psave",testset); 00360 VMat test_outputs; 00361 VMat test_costs; 00362 if(save_test_outputs) 00363 test_outputs = new FileVMatrix(splitdir+setname+"_outputs.pmat",0,outputsize); 00364 if(save_test_costs) 00365 test_costs = new FileVMatrix(splitdir+setname+"_costs.pmat",0,testcostsize); 00366 00367 test_stats->forget(); 00368 learner->test(testset, test_stats, test_outputs, test_costs); 00369 test_stats->finalize(); 00370 if(save_stat_collectors) 00371 PLearn::save(splitdir+setname+"_stats.psave",test_stats); 00372 } 00373 00374 Vec splitres(1+nstats); 00375 splitres[0] = splitnum; 00376 00377 for(int k=0; k<nstats; k++) 00378 { 00379 StatSpec& sp = statspecs[k]; 00380 splitres[k+1] = stcol[sp.setnum]->getStats(sp.costindex).getStat(sp.intstat); 00381 } 00382 00383 if(split_stats_vm) 00384 split_stats_vm->appendRow(splitres); 00385 00386 global_statscol->update(splitres.subVec(1,nstats)); 00387 } 00388 00389 00390 Vec global_result(nstats); 00391 00392 global_statscol->finalize(); 00393 for(int k=0; k<nstats; k++) 00394 global_result[k] = global_statscol->getStats(k).getStat(statspecs[k].extstat); 00395 00396 if(global_stats_vm) 00397 global_stats_vm->appendRow(global_result); 00398 00399 return global_result; 00400 } 00401 00402 } // end of namespace PLearn

Generated on Tue Aug 17 16:01:25 2004 for PLearn by doxygen 1.3.7