00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
#include "old_plearn_main.h"
00042
00043
#include <plearn/io/MatIO.h>
00044
#include <plearn/io/fileutils.h>
00045
#include <plearn/db/getDataSet.h>
00046
#include <plearn/math/random.h>
00047
#include <plearn_learners/generic/Learner.h>
00048
#include <plearn/opt/Optimizer.h>
00049
#include <plearn/ker/Kernel.h>
00050
#include <plearn_learners/misc/Experiment.h>
00051
#include <plearn/vmat/FileVMatrix.h>
00052
#include <plearn/ker/SquaredErrorCostFunction.h>
00053
#include <plearn/sys/PLMPI.h>
00054
00055
namespace PLearn {
00056
using namespace std;
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00072 map<string, string>
getModelAliases(
const string& filename)
00073 {
00074 map<string, string> aliases;
00075 ifstream in(filename.c_str());
00076
if(!in)
00077
PLERROR(
"In getModelAliases: could not open file %s", filename.c_str());
00078
while(in)
00079 {
00080
string alias;
00081 getline(in,alias,
'=');
00082 alias =
removeblanks(alias);
00083
if(alias.length()==0)
00084
break;
00085
if(alias.find_first_of(
" \t\n\r")!=string::npos)
00086
PLERROR(
"In getModelAliases: problem, expecting a single word alias followed by an equal (=) sign; read %s",alias.c_str());
00087
00088 in >>
ws;
00089
string definition;
00090
smartReadUntilNext(in,
";", definition);
00091
remove_comments(definition);
00092 aliases.insert(make_pair(alias,
removeblanks(definition)));
00093 }
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
for(map<string, string>::iterator it= aliases.begin(); it != aliases.end(); ++it)
00104 {
00105
unsigned int pos= 0;
00106
while(string::npos != (pos= it->second.find(
'$', pos)))
00107 {
00108
const string delimiters=
";]";
00109
unsigned int n= string::npos;
00110
for(
unsigned int i= 0; i < delimiters.length(); ++i)
00111 {
00112
unsigned int n0= it->second.find(delimiters[i], pos);
00113
if(n0 < n)
00114 n= n0;
00115 }
00116 n-= pos;
00117
string alias=
removeblanks(it->second.substr(pos+1, n-1));
00118
if(aliases.find(alias) == aliases.end())
00119
PLERROR(
"In getModelAliases: alias %s is referenced but not defined.", alias.c_str());
00120 it->second.replace(pos, n, aliases[alias]);
00121 }
00122 }
00123
00124
return aliases;
00125 }
00126
00127 void train_and_test(
const string& modelalias,
string trainalias,
vector<string> testaliases)
00128 {
00129 map<string,string> dataset_aliases =
getDatasetAliases(
".");
00130
if(dataset_aliases.empty())
00131
exitmsg(
"Problem: No dataset.aliases found in the current directory or its parents");
00132
if(dataset_aliases.find(trainalias)==dataset_aliases.end())
00133
exitmsg(
"Problem: No alias '%s' found in dataset.aliases",trainalias.c_str());
00134
string trainsetdef = dataset_aliases[trainalias];
00135 cout <<
">> Will be training on alias '" << trainalias <<
"': " << trainsetdef <<
endl;
00136
VMat trainset =
getDataSet(trainsetdef,trainalias);
00137 cout <<
" size: " << trainset.
length() <<
" x " << trainset.
width() <<
endl;
00138
00139
int ntestsets = testaliases.size();
00140
Array<VMat> testsets(ntestsets);
00141
for(
int i=0; i<ntestsets; i++)
00142 {
00143
string alias = testaliases[i];
00144
if(dataset_aliases.find(alias)==dataset_aliases.end())
00145
exitmsg(
"Problem: No alias for '%s' found in dataset.aliases",alias.c_str());
00146
string testsetdef = dataset_aliases[testaliases[i]];
00147 cout <<
">> Will be testing on alias '" << alias <<
"': " << testsetdef <<
endl;
00148 testsets[i] =
getDataSet(testsetdef, alias);
00149 cout <<
" size: " << testsets[i].
length() <<
" x " << testsets[i].width() <<
endl;
00150 }
00151
00152
if(!
isfile(
"model.aliases"))
00153
exitmsg(
"Problem: No model.aliases file in current directory");
00154 map<string, string> model_aliases =
getModelAliases(
"model.aliases");
00155
if(model_aliases.find(modelalias)==model_aliases.end())
00156
exitmsg(
"Problem: Could not find alias %s in file model.aliases",modelalias.c_str());
00157
00158
string use_saved_model =
"";
00159
if(
isdir(modelalias))
00160 {
00161
vector<string> dirlist =
lsdir(modelalias);
00162
vector<string>::iterator it = dirlist.begin();
00163
vector<string>::iterator itend = dirlist.end();
00164
int maxmodelnum = -1;
00165
for(; it!=itend; ++it)
00166 {
00167
int itl = it->length();
00168
if(*it ==
"model.psave")
00169 {
00170 use_saved_model = modelalias +
"/" + *it;
00171
break;
00172 }
00173
else if(itl>11 && it->substr(0,5)==
"model" && it->substr(itl-6,6)==
".psave")
00174 {
00175
int modelnum =
toint(it->substr(5,itl-11));
00176
if(modelnum>maxmodelnum)
00177 {
00178 modelnum = maxmodelnum;
00179 use_saved_model = modelalias +
"/" + *it;
00180 }
00181 }
00182 }
00183 }
00184
00185
PP<Learner> learner;
00186
if(use_saved_model!=
"")
00187 {
00188 cout <<
">> Loading saved learner from file " << use_saved_model <<
endl;
00189 learner = dynamic_cast<Learner*>(
loadObject(use_saved_model));
00190
if(!learner)
00191
exitmsg(
"Problem in making file %s into a Learner",use_saved_model.c_str());
00192 }
00193
else
00194 {
00195
string modelspec = model_aliases[modelalias];
00196 cout <<
">> Creating learner: " << modelspec <<
endl;
00197
PLearn::read(modelspec, learner);
00198
00199 }
00200
00201
00202
00203 cout <<
">> Learner has inputsize=" << learner->inputsize() <<
" targetsize=" << learner->targetsize() <<
" outputsize=" << learner->outputsize() <<
endl;
00204
00205
00206
00207 learner->setExperimentDirectory(modelalias);
00208 learner->setTestDuringTrain(testsets);
00209
00210 cout <<
"Training and testing..." <<
endl;
00211 learner->train(trainset);
00212
00213
string psavefile = learner->basename()+
".psave";
00214 cout <<
">>> Saving final trained model in file: " << psavefile <<
endl;
00215 cerr <<
"{Temporarily commented out by Pascal: don't want to save the Object.\n"
00216 <<
" Also with the current 3 argument version, this systematically calls newsave,\n"
00217 <<
" so older objects which don't yet have a functional option system cannot be saved through this: to be fixed!!!\n";
00218
00219
#if 0
00220
00221
00222
string targetfile = learner->basename()+
".targets.pmat";
00223
string outputfile = learner->basename()+
"."+datasetalias+
".outputs.pmat";
00224
string costfile = learner->basename()+
"."+datasetalias+
".costs.pmat";
00225
VMat vm = testsets[ntestsets-1];
00226
int l = vm.
length();
00227
VMat outputmat =
new FileVMatrix(outputfile,l,learner->outputsize());
00228
VMat costmat =
new FileVMatrix(costfile,l,learner->costsize());
00229
VMat targetmat =
new FileVMatrix(targetfile,l,learner->targetsize());
00230
Vec input_and_target(vm.
width());
00231
Vec input = input_and_target.
subVec(0,learner->inputsize());
00232
Vec target = input_and_target.
subVec(learner->inputsize(), learner->targetsize());
00233
Vec output(learner->outputsize());
00234
Vec cost(learner->costsize());
00235
Vec costs(learner->costsize(), 0.0);
00236 {
00237
ProgressBar pbar(cout,
"Computing output and cost",l);
00238
for(
int i=0; i<l; i++)
00239 {
00240 vm->getRow(i,input_and_target);
00241 learner->useAndCost(input, target, output, cost);
00242 targetmat->putRow(i,target);
00243 outputmat->putRow(i,output);
00244 costmat->putRow(i,cost);
00245 costs+= cost;
00246 pbar(i);
00247 }
00248
00249 }
00250
00251 cout << learner->costNames() <<
endl
00252 << costs/l <<
endl;
00253
00254
#endif
00255
00256
save(psavefile, *learner);
00257
00258 }
00259
00260 vector<string> getMultipleModelAliases(
const string& model)
00261 {
00262
vector<string> result;
00263
if(model[model.length()-1]!=
'*')
00264 {
00265 result.push_back(model);
00266
return result;
00267 }
00268
string modelprefix=model.substr(0,model.length()-1);
00269
if(!
isfile(
"model.aliases"))
00270
exitmsg(
"Problem: No model.aliases file in current directory");
00271 map<string, string> model_aliases =
getModelAliases(
"model.aliases");
00272
for(map<string,string>::iterator it=model_aliases.begin();it!=model_aliases.end();it++)
00273
if(modelprefix==
"" || it->first.find(modelprefix)==0)
00274 result.push_back(it->first);
00275
return result;
00276 }
00277
00278
00279 void cross_valid(
const string& modelalias,
string trainalias,
int kval)
00280 {
00281 map<string,string> dataset_aliases =
getDatasetAliases(
".");
00282
if(dataset_aliases.empty())
00283
exitmsg(
"Problem: No dataset.aliases found in the current directory or its parents");
00284
if(dataset_aliases.find(trainalias)==dataset_aliases.end())
00285
exitmsg(
"Problem: No alias '%s' found in dataset.aliases",trainalias.c_str());
00286
string trainsetdef = dataset_aliases[trainalias];
00287 cout <<
">> Will be crossvalidating with a kfold value of "<<kval<<
" on alias '" << trainalias <<
"': " << trainsetdef <<
endl;
00288
VMat trainset =
getDataSet(trainsetdef,trainalias);
00289 cout <<
" size of whole dataset: " << trainset.
length() <<
" x " << trainset.
width() <<
endl;
00290
00291
if(!
isfile(
"model.aliases"))
00292
exitmsg(
"Problem: No model.aliases file in current directory");
00293 map<string, string> model_aliases =
getModelAliases(
"model.aliases");
00294
if(model_aliases.find(modelalias)==model_aliases.end())
00295
exitmsg(
"Problem: Could not find alias %s in file model.aliases",modelalias.c_str());
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
PP<Learner> learner;
00327
00328
00329
00330
00331
00332
00333
00334
00335 {
00336
string modelspec = model_aliases[modelalias];
00337 cout <<
">> Creating learner: " << modelspec <<
endl;
00338
PLearn::read(modelspec, learner);
00339
00340 }
00341
00342
00343
00344 cout <<
">> Learner has inputsize=" << learner->inputsize() <<
" targetsize=" << learner->targetsize() <<
" outputsize=" << learner->outputsize() <<
endl;
00345
00346
if(trainset.
width()!=learner->inputsize()+learner->targetsize())
00347
exitmsg(
"Problem: learner's inputsize+targetsize differs from the width of the trainingset!!!");
00348
00349 learner->setExperimentDirectory(modelalias);
00350
00351
Mat mglobal(0,0);
00352
Mat mhist(0,0);
00353
TVec<std::string> fnames;
00354
00355
for(
int i=0;i<kval;i++)
00356 {
00357
VMat train_k,test_k;
00358
split(trainset, 1.0f/kval, train_k, test_k, kval-i-1);
00359 train_k->setAlias(trainset->getAlias()+
"_kf"+
tostring(kval)+
"_"+
tostring(i));
00360 test_k->setAlias(trainset->getAlias()+
"_kf"+
tostring(kval)+
"_-"+
tostring(i));
00361
00362 learner->forget();
00363 learner->setTestDuringTrain(test_k);
00364
00365 cout <<
"Training and testing ... train.length="<<train_k.
length()<<
" test.length="<<test_k.
length()<<
" step:" << i+1 <<
" / "<<kval<<
endl;
00366 learner->train(train_k);
00367
00368
string psavefile = learner->basename()+
".psave";
00369 cout <<
">>> Saving final trained model in file: " << psavefile <<
endl;
00370
save(psavefile, *learner);
00371
00372
00373
00374
Mat mmhist;
00375
loadAscii(learner->basename()+
"."+test_k->getAlias()+
".hist.results",mmhist,fnames);
00376
if(mhist.
width()!=mmhist.
width() || mhist.
length()!=mmhist.
length())
00377 {
00378
if(mhist.
width()!=0)
00379
PLWARNING(
"While merging results file in hist.results: differents parts of the kfold don't have the same number of epochs (are you using early stopping?)");
00380 mhist.
resize(mmhist.
length(),mmhist.
width());
00381 }
00382 mhist+=mmhist;
00383 }
00384
00385 mhist/=kval;
00386
Vec best(mhist.
width(),FLT_MAX);
00387
00388
00389
00390
for(
int i=0;i<mhist.
length();i++)
00391
if(mhist[i][2]<best[2])
00392 best=mhist(i);
00393 ofstream out((learner->getExperimentDirectory()+trainset->getAlias()+
".results").c_str());
00394
string fields;
00395
for(
int i=0;i<fnames.
size();i++)
00396 fields+=fnames[i]+=
" ";
00397 out<<
"#: "<<fields<<
endl;
00398 out<<best<<
endl;
00400
00401 ofstream out2((learner->getExperimentDirectory()+trainset->getAlias()+
".hist.results").c_str());
00402 out2<<
"#: "<<fields<<
endl;
00403 out2<<mhist<<
endl;
00404 }
00405
00406
00407 void use(
const string& modelfile,
const string& datasetalias)
00408 {
00409 map<string,string> aliases =
getDatasetAliases(modelfile);
00410
if(aliases.empty())
00411
exitmsg(
"Problem: could not locate a meaningful dataset.aliases file in this or parent directories");
00412
if(aliases.find(datasetalias)==aliases.end())
00413
exitmsg(
"Problem: no %s in dataset.aliases file",datasetalias.c_str());
00414
string dataset = aliases[datasetalias];
00415
VMat vm =
getDataSet(dataset);
00416 cout <<
">> Dataset has " << vm.
length() <<
" rows and " << vm.
width() <<
" columns" <<
endl;
00417
PP<Learner> learner = dynamic_cast<Learner*>(
loadObject(modelfile));
00418
if(!learner)
00419
exitmsg(
"Problem in making file %s into a Learner",modelfile.c_str());
00420
00421
if(learner->costsize() < 1)
00422 learner->setTestCostFunctions(
Array<Ker>(
new SquaredErrorCostFunction()));
00423
00424 cout <<
">> Learner has inputsize=" << learner->inputsize() <<
" targetsize=" << learner->targetsize() <<
" outputsize=" << learner->outputsize() <<
endl;
00425
00426
00427
string targetfile = datasetalias+
".targets.pmat";
00428
string outputfile =
remove_extension(modelfile)+
"."+datasetalias+
".outputs.pmat";
00429
string costfile =
remove_extension(modelfile)+
"."+datasetalias+
".costs.pmat";
00430
int l = vm.
length();
00431
VMat outputmat =
new FileVMatrix(outputfile,l,learner->outputsize());
00432
VMat costmat =
new FileVMatrix(costfile,l,learner->costsize());
00433
VMat targetmat =
new FileVMatrix(targetfile,l,learner->targetsize());
00434
Vec input_and_target(vm.
width());
00435
Vec input = input_and_target.
subVec(0,learner->inputsize());
00436
Vec target = input_and_target.
subVec(learner->inputsize(), learner->targetsize());
00437
Vec output(learner->outputsize());
00438
Vec cost(learner->costsize());
00439
Vec costs(learner->costsize(), 0.0);
00440 {
00441
ProgressBar pbar(cout,
"Computing output and cost",l);
00442
for(
int i=0; i<l; i++)
00443 {
00444 vm->getRow(i,input_and_target);
00445 learner->useAndCost(input, target, output, cost);
00446 targetmat->putRow(i,target);
00447 outputmat->putRow(i,output);
00448 costmat->putRow(i,cost);
00449 costs+= cost;
00450 pbar(i);
00451 }
00452
00453 }
00454
00455 cout << learner->costNames() <<
endl
00456 << costs/l <<
endl;
00457
00458 }
00459
00460 void usage()
00461 {
00462 cerr <<
"Usage: " <<
endl
00463 <<
" * plearn train <modelalias> <trainsetalias> [<testsetalias> <testsetalias> ...]\n"
00464 <<
" Will look for the corresponding alias in the 'model.aliases' file in the current directory \n"
00465 <<
" as well as for the specified dataset aliases in a 'dataset.aliases' file in the current or parent direcotries \n"
00466 <<
" It will then build the specified learner with the specified learneroptions, \n"
00467 <<
" train it on the specified train set, and save results (including test results \n"
00468 <<
" on specified testsets) in <modelalias> directory. \n"
00469 <<
" NOTE: you can train multiple models if you append ('*') to a model alias prefix.\n"
00470 <<
" Dont forget the quotes when you use the wildcard to prevent shelle expansion!\n"
00471 <<
" e.g: 'plearn train 'linear*' train valid'.\n"
00472 <<
" * plearn cross kfoldval <modelalias> <trainsetalias>\n"
00473 <<
" As with train, but will perform a crossvalidation training with ?? Pascal, complete ca stp:)\n"
00474 <<
" * plearn use <model#.psave> <datasetalias>\n"
00475 <<
" After locating the appropriate dataset.aliases looking in parent directories, \n"
00476 <<
" will apply the saved model to the specified dataset, and compute and create \n"
00477 <<
" <model#>.<datasetalias>.outputs.pmat and <model#>.<datasetalias>.costs.pmat \n"
00478 <<
" * plearn listmodels <model> \n"
00479 <<
" list the model aliases in the model.aliases file\n"
00480 <<
" model can optionnaly contain a wildcard '*'\n"
00481
00482
00483
00484
00485
00486
00487
00488
00489 <<
" * plearn help datasets \n"
00490 <<
" Will display info about the dataset specification strings you can use to define \n"
00491 <<
" aliases in the dataset.aliases file \n"
00492 <<
" * plearn help Learner \n"
00493 <<
" Will print a list of available learners\n"
00494 <<
" * plearn help Optimizer \n"
00495 <<
" Will print a list of available optimizers\n"
00496 <<
" * plearn help <object-type> \n"
00497 <<
" Will display help (mostly about available options) for that object-type\n"
00498 <<
endl;
00499 exit(0);
00500 }
00501
00502 int old_plearn_main(
int argc,
char** argv)
00503 {
00504 PLMPI::init(&argc, &argv);
00505
00506
seed();
00507
00508
if(argc<2)
00509
usage();
00510
00511
string command = argv[1];
00512
00513
if(command==
"train")
00514 {
00515
vector<string> modelaliases =
getMultipleModelAliases(argv[2]);
00516
string trainalias = argv[3];
00517
vector<string> testaliases =
stringvector(argc-4, argv+4);
00518
00519
for(
unsigned int i=0;i<modelaliases.size();i++)
00520 {
00521 cout<<
"*** Doing job for alias : "<< modelaliases[i]<<
endl;
00522
train_and_test(modelaliases[i], trainalias, testaliases);
00523 }
00524 }
00525
else if(command==
"cross")
00526 {
00527
if(argc<4)
00528
usage();
00529
int kval=
toint(argv[2]);
00530
vector<string> modelaliases =
getMultipleModelAliases(argv[3]);
00531
string trainalias = argv[4];
00532
for(
unsigned int i=0;i<modelaliases.size();i++)
00533 {
00534 cout<<
"*** Doing job for alias : "<< modelaliases[i]<<
endl;
00535
00536
cross_valid(modelaliases[i], trainalias, kval);
00537 }
00538 }
00539
else if(command==
"use")
00540 {
00541
vector<string> modelaliases =
getMultipleModelAliases(argv[2]);
00542
string datasetalias = argv[3];
00543
00544
for(
unsigned int i=0;i<modelaliases.size();i++)
00545 {
00546 cout<<
"*** Doing job for alias : "<< modelaliases[i]<<
endl;
00547
use(modelaliases[i], datasetalias);
00548 }
00549 }
00550
else if(command==
"help")
00551 {
00552
string aboutwhat = argv[2];
00553
if(aboutwhat==
"datasets")
00554 cout <<
getDataSetHelp();
00555
else
00556
displayObjectHelp(cout, aboutwhat);
00557 }
00558
else if(command==
"listmodels")
00559 {
00560
if(!
isfile(
"model.aliases"))
00561
exitmsg(
"Problem: No model.aliases file in current directory");
00562
string mod;
00563
if(argc==2)
00564 mod=
"*";
00565
else
00566 mod=argv[2];
00567
vector<string> ali =
getMultipleModelAliases(mod);
00568 cout<<
"Model aliases found in model.aliases:"<<
endl;
00569
for(
unsigned int i=0;i<ali.size();i++)
00570 cout<<ali[i]<<
endl;
00571 }
00572
00573 PLMPI::finalize();
00574
return 0;
00575
00576 }
00577
00578 }