Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

SequentialValidation.cc

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // SequentialValidation.cc 00004 // 00005 // Copyright (C) 2003 Rejean Ducharme, Yoshua Bengio 00006 // Copyright (C) 2003 Pascal Vincent 00007 // 00008 // Redistribution and use in source and binary forms, with or without 00009 // modification, are permitted provided that the following conditions are met: 00010 // 00011 // 1. Redistributions of source code must retain the above copyright 00012 // notice, this list of conditions and the following disclaimer. 00013 // 00014 // 2. Redistributions in binary form must reproduce the above copyright 00015 // notice, this list of conditions and the following disclaimer in the 00016 // documentation and/or other materials provided with the distribution. 00017 // 00018 // 3. The name of the authors may not be used to endorse or promote 00019 // products derived from this software without specific prior written 00020 // permission. 00021 // 00022 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00023 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00024 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00025 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00026 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00027 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00028 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00029 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00030 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00031 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00032 // 00033 // This file is part of the PLearn library. For more information on the PLearn 00034 // library, go to the PLearn Web site at www.plearn.org 00035 00036 00037 00038 #include "SequentialValidation.h" 00039 #include <plearn/math/VecStatsCollector.h> 00040 //#include "AsciiVMatrix.h" 00041 #include <plearn/vmat/FileVMatrix.h> 00042 #include <plearn_learners/testers/PTester.h> // for using class StatSpec 00043 00044 namespace PLearn { 00045 using namespace std; 00046 00047 00048 PLEARN_IMPLEMENT_OBJECT( 00049 SequentialValidation, 00050 "The SequentialValidation class allows you to describe a typical " 00051 "sequential validation experiment that you wish to perform.", 00052 "NO HELP"); 00053 00054 SequentialValidation::SequentialValidation() 00055 : init_train_size(1), 00056 last_test_time(-1), 00057 expdir(""), 00058 report_stats(true), 00059 save_final_model(true), 00060 save_initial_model(false), 00061 save_initial_seqval(true), 00062 save_data_sets(false), 00063 save_test_outputs(false), 00064 save_test_costs(false), 00065 save_stat_collectors(false), 00066 provide_learner_expdir(true) 00067 {} 00068 00069 void SequentialValidation::build_() 00070 { 00071 if (expdir != "") 00072 { 00073 if(pathexists(expdir)) 00074 PLERROR("Directory (or file) %s already exists. First move it out of the way.", expdir.c_str()); 00075 if(!force_mkdir(expdir)) 00076 PLERROR("Could not create experiment directory %s", expdir.c_str()); 00077 } 00078 } 00079 00080 void SequentialValidation::build() 00081 { 00082 inherited::build(); 00083 build_(); 00084 } 00085 00086 void SequentialValidation::declareOptions(OptionList& ol) 00087 { 00088 declareOption(ol, "report_stats", &SequentialValidation::report_stats, 00089 OptionBase::buildoption, 00090 "If true, the computed global statistics specified in statnames will be saved in global_stats.pmat \n" 00091 "and the corresponding per-split statistics will be saved in split_stats.pmat \n" 00092 "For reference, all cost names (as given by the learner's getTrainCostNames() and getTestCostNames() ) \n" 00093 "will be reported in files train_cost_names.txt and test_cost_names.txt"); 00094 00095 declareOption(ol, "statnames", &SequentialValidation::statnames, 00096 OptionBase::buildoption, 00097 "A list of global statistics we are interested in.\n" 00098 "These are strings of the form S1[S2[dataset.cost_name]] where:\n" 00099 " - dataset is train or test1 or test2 ... (train being \n" 00100 " the first dataset in a split, test1 the second, ...) \n" 00101 " - cost_name is one of the training or test cost names (depending on dataset) understood \n" 00102 " by the underlying learner (see its getTrainCostNames and getTestCostNames methods) \n" 00103 " - S1 and S2 are a statistic, i.e. one of: E (expectation), V(variance), MIN, MAX, STDDEV, ... \n" 00104 " S2 is computed over the samples of a given dataset split. S1 is over the splits. \n"); 00105 00106 declareOption(ol, "timewise_statnames", &SequentialValidation::timewise_statnames, 00107 OptionBase::buildoption, 00108 "Statistics to be collected into a VecStatsCollector at each timestep."); 00109 00110 declareOption(ol, "expdir", &SequentialValidation::expdir, 00111 OptionBase::buildoption, 00112 "Path of this experiment's directory in which to save all experiment results (will be created if it does not already exist). \n"); 00113 00114 declareOption(ol, "learner", &SequentialValidation::learner, 00115 OptionBase::buildoption, 00116 "The SequentialLearner to train/test. \n"); 00117 00118 declareOption(ol, "dataset", &SequentialValidation::dataset, 00119 OptionBase::buildoption, 00120 "The dataset to use for training/testing. \n"); 00121 00122 declareOption(ol, "init_train_size", &SequentialValidation::init_train_size, 00123 OptionBase::buildoption, 00124 "Size of the first training set. \n"); 00125 00126 declareOption(ol, "last_test_time", &SequentialValidation::last_test_time, 00127 OptionBase::buildoption, 00128 "The last time-step to use for testing (Default = -1, i.e. use all data)"); 00129 00130 declareOption(ol, "save_final_model", &SequentialValidation::save_final_model, 00131 OptionBase::buildoption, 00132 "If true, the final model will be saved in model.psave \n"); 00133 00134 declareOption(ol, "save_initial_model", &SequentialValidation::save_initial_model, 00135 OptionBase::buildoption, 00136 "If true, the initial model will be saved in initial_model.psave. \n"); 00137 00138 declareOption(ol, "save_initial_seqval", &SequentialValidation::save_initial_seqval, 00139 OptionBase::buildoption, 00140 "If true, this SequentialValidation object will be saved in sequential_validation.psave. \n"); 00141 00142 declareOption(ol, "save_data_sets", &SequentialValidation::save_data_sets, 00143 OptionBase::buildoption, 00144 "If true, the data sets (train/test) for each split will be saved. \n"); 00145 00146 declareOption(ol, "save_test_outputs", &SequentialValidation::save_test_outputs, 00147 OptionBase::buildoption, 00148 "If true, the outputs of the tests will be saved in test_outputs.pmat \n"); 00149 00150 declareOption(ol, "save_test_costs", &SequentialValidation::save_test_costs, 00151 OptionBase::buildoption, 00152 "If true, the costs of the tests will be saved in test_costs.pmat \n"); 00153 00154 declareOption(ol, "save_stat_collectors", &SequentialValidation::save_stat_collectors, 00155 OptionBase::buildoption, 00156 "If true, stat collectors of each data sets (train/test) will be saved for each split. \n"); 00157 00158 declareOption(ol, "provide_learner_expdir", &SequentialValidation::provide_learner_expdir, 00159 OptionBase::buildoption, 00160 "If true, learning results from the learner will be saved. \n"); 00161 00162 inherited::declareOptions(ol); 00163 } 00164 00165 void SequentialValidation::run() 00166 { 00167 if (expdir=="") 00168 PLERROR("No expdir specified for SequentialValidation."); 00169 if (!learner) 00170 PLERROR("No learner specified for SequentialValidation."); 00171 00172 // This is to set inputsize() and targetsize() 00173 learner->setTrainingSet(dataset, false); 00174 00175 setExperimentDirectory( append_slash(expdir) ); 00176 00177 // Save this experiment description in the expdir (buildoptions only) 00178 if (save_initial_seqval) 00179 PLearn::save(expdir+"sequential_validation.psave", *this); 00180 00181 TVec<string> testcostnames = learner->getTestCostNames(); 00182 TVec<string> traincostnames = learner->getTrainCostNames(); 00183 00184 int outputsize = learner->outputsize(); 00185 int nstats = statnames.length(); 00186 int timewise_nstats = timewise_statnames.length(); 00187 00188 TVec< PP<VecStatsCollector> > stcol(2); // one for train and one for test 00189 00190 // stats for a train on one split 00191 PP<VecStatsCollector> train_stats = new VecStatsCollector(); 00192 train_stats->setFieldNames(traincostnames); 00193 learner->setTrainStatsCollector(train_stats); 00194 stcol[0] = train_stats; 00195 00196 // stats for a test on one split 00197 PP<VecStatsCollector> test_stats = new VecStatsCollector(); 00198 test_stats->setFieldNames(testcostnames); 00199 stcol[1] = test_stats; 00200 00201 // stats over all sequence 00202 PP<VecStatsCollector> sequence_stats = new VecStatsCollector(); 00203 00204 // Stat specs (overall) 00205 TVec<StatSpec> statspecs(nstats); 00206 for (int k=0; k<nstats; k++) 00207 statspecs[k].init(statnames[k]); 00208 00209 // timewise stats (may not be used) 00210 PP<VecStatsCollector> timewise_stats = new VecStatsCollector(); 00211 00212 // Stat specs (timewise) 00213 TVec<StatSpec> timewise_statspecs(timewise_nstats); 00214 for (int k=0; k<timewise_nstats; ++k) 00215 timewise_statspecs[k].init(timewise_statnames[k]); 00216 00217 VMat global_stats_vm; // vmat where to save global result stats specified in statnames 00218 VMat split_stats_vm; // vmat where to save per split result stats 00219 VMat timewise_stats_vm; // vmat where to save timewise statistics 00220 00221 // Create all VMatrix related to saving statistics 00222 if (report_stats) 00223 { 00224 saveStringInFile(expdir+"train_cost_names.txt", join(traincostnames,"\n")+"\n"); 00225 saveStringInFile(expdir+"test_cost_names.txt", join(testcostnames,"\n")+"\n"); 00226 00227 global_stats_vm = new FileVMatrix(expdir+"global_stats.pmat", 0, nstats); 00228 for(int k=0; k<nstats; k++) 00229 global_stats_vm->declareField(k,statspecs[k].statName()); 00230 global_stats_vm->saveFieldInfos(); 00231 00232 split_stats_vm = new FileVMatrix(expdir+"sequence_stats.pmat", 0, 00233 1+nstats); 00234 split_stats_vm->declareField(0,"splitnum"); 00235 for(int k=0; k<nstats; k++) 00236 split_stats_vm->declareField(k+1,statspecs[k].setname + "." + statspecs[k].intstatname); 00237 split_stats_vm->saveFieldInfos(); 00238 00239 if (timewise_nstats > 0) { 00240 timewise_stats_vm = new FileVMatrix(expdir+"timewise_stats.pmat", 0, 00241 timewise_nstats); 00242 for (int k=0; k<timewise_nstats; ++k) 00243 timewise_stats_vm->declareField(k, timewise_statspecs[k].statName()); 00244 timewise_stats_vm->saveFieldInfos(); 00245 } 00246 } 00247 00248 // some learner parameters 00249 learner->build(); 00250 00251 VMat test_outputs; 00252 VMat test_costs; 00253 if (save_test_outputs) 00254 test_outputs = new FileVMatrix(expdir+"/test_outputs.pmat",0,outputsize); 00255 if (save_test_costs) 00256 test_costs = new FileVMatrix(expdir+"/test_costs.pmat",0,testcostnames); 00257 00258 // Some further initializations 00259 int maxt = (last_test_time >= 0? last_test_time : dataset.length() - 1); 00260 int splitnum = 0; 00261 double weight; 00262 Vec input, target; 00263 Vec output(learner->outputsize()); 00264 Vec costs(learner->nTestCosts()); 00265 00266 for (int t=init_train_size; t <= maxt; t++, splitnum++) 00267 { 00268 #ifdef DEBUG 00269 cout << "SequentialValidation::run() -- sub_train.length = " << t << " et sub_test.length = " << t+horizon << endl; 00270 #endif 00271 VMat sub_train = dataset.subMatRows(0,t); // excludes t, last training pair is (t-2,t-1) 00272 VMat sub_test = dataset.subMatRows(0, t+1); 00273 VMat only_test = dataset.subMatRows(t, 1); 00274 00275 string splitdir = expdir+"train_t="+tostring(t)+"/"; 00276 if (save_data_sets || save_initial_model || save_stat_collectors || save_final_model) 00277 force_mkdir(splitdir); 00278 if (save_data_sets) 00279 PLearn::save(splitdir+"training_set.psave", sub_train); 00280 if (save_initial_model) 00281 PLearn::save(splitdir+"initial_learner.psave",learner); 00282 00283 // TRAIN 00284 train_stats->forget(); 00285 learner->setTrainingSet(sub_train, false); 00286 learner->train(); 00287 train_stats->finalize(); 00288 00289 if (save_stat_collectors) 00290 PLearn::save(splitdir+"train_stats.psave",train_stats); 00291 if (save_final_model) 00292 PLearn::save(splitdir+"final_learner.psave",learner); 00293 00294 // TEST: simply use computeOutputAndCosts for 1 observation in this 00295 // implementation 00296 dataset.getExample(t, input, target, weight); 00297 test_stats->forget(); 00298 learner->setTestSet(sub_test); // temporary hack 00299 learner->setCurrentTestTime(t); // temporary hack 00300 learner->computeOutputAndCosts(input, target, output, costs); 00301 test_stats->update(costs); 00302 test_stats->finalize(); 00303 00304 // Save what is required from the test run 00305 if (save_data_sets) 00306 PLearn::save(splitdir+"test_set.psave", sub_test); 00307 if (test_outputs) 00308 test_outputs->appendRow(output); 00309 if (test_costs) 00310 test_costs->appendRow(costs); 00311 if (save_stat_collectors) 00312 PLearn::save(splitdir+"test_stats.psave",test_stats); 00313 00314 Vec splitres(1+nstats); 00315 splitres[0] = splitnum; 00316 00317 // Compute statnames for this split only 00318 for(int k=0; k<nstats; k++) 00319 { 00320 StatSpec& sp = statspecs[k]; 00321 if (sp.setnum>=stcol.length()) 00322 PLERROR("SequentialValidation::run, trying to access a test set (test%d) beyond the last one (test%d)", 00323 sp.setnum, stcol.length()-1); 00324 splitres[k+1] = stcol[sp.setnum]->getStat(sp.intstatname); 00325 } 00326 00327 if (split_stats_vm) 00328 split_stats_vm->appendRow(splitres); 00329 00330 // Add to overall stats collector 00331 sequence_stats->update(splitres.subVec(1,nstats)); 00332 00333 // Now compute timewise statnames. First loop is on the inner 00334 // statistics; then update the stats collector; then loop on the outer 00335 // statistics 00336 if (timewise_stats_vm) { 00337 Vec timewise_res(timewise_nstats); 00338 for (int k=0; k<timewise_nstats; ++k) { 00339 StatSpec& sp = timewise_statspecs[k]; 00340 if (sp.setnum>=stcol.length()) 00341 PLERROR("SequentialValidation::run, trying to access a test set " 00342 "(test%d) beyond the last one (test%d)", 00343 sp.setnum, stcol.length()-1); 00344 timewise_res[k] = stcol[sp.setnum]->getStat(sp.intstatname); 00345 } 00346 timewise_stats->update(timewise_res); 00347 for (int k=0; k<timewise_nstats; ++k) 00348 timewise_res[k] = 00349 timewise_stats->getStats(k).getStat(timewise_statspecs[k].extstat); 00350 timewise_stats_vm->appendRow(timewise_res); 00351 } 00352 } 00353 00354 sequence_stats->finalize(); 00355 00356 Vec global_result(nstats); 00357 for (int k=0; k<nstats; k++) 00358 global_result[k] = sequence_stats->getStats(k).getStat(statspecs[k].extstat); 00359 00360 if (global_stats_vm) 00361 global_stats_vm->appendRow(global_result); 00362 00363 reportStats(global_result); 00364 } 00365 00366 void SequentialValidation::setExperimentDirectory(const string& _expdir) 00367 { 00368 expdir = _expdir; 00369 if(provide_learner_expdir) 00370 learner->setExperimentDirectory(append_slash(expdir)+"Model"); 00371 } 00372 00373 void SequentialValidation::reportStats(const Vec& global_result) 00374 { 00375 if (!report_stats) 00376 return; 00377 00378 saveAscii(expdir+"global_result.avec", global_result); 00379 // saveAscii(expdir+"predictions.amat", learner->predictions); 00380 // saveAscii(expdir+"errors.amat", learner->errors, learner->getTestCostNames()); 00381 } 00382 00383 } // end of namespace PLearn 00384

Generated on Tue Aug 17 16:05:31 2004 for PLearn by doxygen 1.3.7