00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
#include "databases.h"
#include "NistDB.h"

#include <cstdio>    // snprintf
#include <cstdlib>   // system

#include <plearn/math/random.h>
#include <plearn/vmat/ConcatRowsVMatrix.h>
#include <plearn/vmat/RemapLastColumnVMatrix.h>
#include <plearn/vmat/ShiftAndRescaleVMatrix.h>
#include <plearn/vmat/Splitter.h>
#include <plearn/vmat/VMat_maths.h>
00052
00053
namespace PLearn {
00054
using namespace std;
00055
00056
00057 #define JAVA "java"
00058
00059 Mat input2dSet(
const string& filename)
00060 {
00061
Mat data;
00062
if(!
file_exists(filename))
00063 {
00064
string systemstring =
string(
JAVA) +
" InputPoints " + filename +
" -1 1 -1 1";
00065 system(systemstring.c_str());
00066 }
00067
loadAscii(filename, data);
00068
shuffleRows(data);
00069
return data;
00070 }
00071
00072
00073 void normalizeDataSets(
Mat& training_set,
Mat& validation_set,
Mat& test_set)
00074 {
00075
int inputsize = training_set.
width()-1;
00076
Mat training_inputs = training_set.
subMatColumns(0,inputsize);
00077
Vec meanvec(inputsize);
00078
Vec stddevvec(inputsize);
00079
computeMeanAndStddev(training_inputs, meanvec, stddevvec);
00080 training_inputs -= meanvec;
00081 training_inputs /= stddevvec;
00082
Mat validation_inputs = validation_set.
subMatColumns(0,inputsize);
00083 validation_inputs -= meanvec;
00084 validation_inputs /= stddevvec;
00085
Mat test_inputs = test_set.
subMatColumns(0,inputsize);
00086 test_inputs -= meanvec;
00087 test_inputs /= stddevvec;
00088 }
00089
//! Standardizes the input part (all columns but the last) of the three data
//! sets, using the mean and standard deviation computed on the training set.
//! NOTE(review): subMatColumns is called on VMat objects but assigned to Mat;
//! whether the normalization below is reflected in the underlying VMatrix
//! depends on the VMat->Mat conversion sharing storage -- TODO confirm.
void normalizeDataSets(VMat& training_set, VMat& validation_set, VMat& test_set)
{
    int inputsize = training_set.width()-1; // last column is the target
    Mat training_inputs = training_set.subMatColumns(0,inputsize);
    Vec meanvec(inputsize);
    Vec stddevvec(inputsize);
    computeMeanAndStddev(training_inputs, meanvec, stddevvec);
    training_inputs -= meanvec;
    training_inputs /= stddevvec;
    // Validation and test inputs are rescaled with the *training* statistics.
    Mat validation_inputs = validation_set.subMatColumns(0,inputsize);
    validation_inputs -= meanvec;
    validation_inputs /= stddevvec;
    Mat test_inputs = test_set.subMatColumns(0,inputsize);
    test_inputs -= meanvec;
    test_inputs /= stddevvec;
}
00106
00107
00108 void normalizeDataSets(
Mat& training_set,
Mat& test_set)
00109 {
00110
int inputsize = training_set.
width()-1;
00111
Mat training_inputs = training_set.
subMatColumns(0,inputsize);
00112
Vec meanvec(inputsize);
00113
Vec stddevvec(inputsize);
00114
computeMeanAndStddev(training_inputs, meanvec, stddevvec);
00115 training_inputs -= meanvec;
00116 training_inputs /= stddevvec;
00117
Mat test_inputs = test_set.
subMatColumns(0,inputsize);
00118 test_inputs -= meanvec;
00119 test_inputs /= stddevvec;
00120 }
00121
00122 void normalizeDataSet(
Mat& m)
00123 {
00124
Vec meanvec(m.
width());
00125
Vec stddevvec(m.
width());
00126
computeMeanAndStddev(m,meanvec,stddevvec);
00127 m -= meanvec;
00128 m /= stddevvec;
00129 }
00130 void splitTrainValidTest(
VMat &data_set,
VMat &train_set,
VMat &valid_set,
00131
real valid_fraction,
VMat &test_set,
real test_fraction,
00132
bool normalize)
00133 {
00134
int nvalid =
int((
real)data_set.
length()*valid_fraction);
00135
int ntest = int((
real)data_set.
length()*test_fraction);
00136
int ntrain = data_set.
length()-(nvalid+ntest);
00137
00138 train_set = data_set.
subMatRows(0,ntrain);
00139 valid_set = data_set.
subMatRows(ntrain, nvalid);
00140 test_set = data_set.
subMatRows(ntrain+nvalid,ntest);
00141
if (
normalize){
00142
VMat train_set_inputs=train_set.
subMatColumns(0,data_set.
width()-1);
00143
VMat valid_set_inputs=valid_set.
subMatColumns(0,data_set.
width()-1);
00144
VMat test_set_inputs = test_set.
subMatColumns(0,data_set.
width()-1);
00145
normalizeDataSets(train_set_inputs,valid_set_inputs,test_set_inputs);
00146 }
00147 }
00148 VMat reduceInputSize(
real fraction,
VMat data)
00149 {
00150
int n_inputs=data->
width()-1;
00151
int reduce_n_inputs=(
int)(fraction*n_inputs);
00152 cout<<
"use "<<reduce_n_inputs<<
" of "<<n_inputs<<
endl;
00153
VMat new_data = data.
subMatColumns(n_inputs-reduce_n_inputs,1+reduce_n_inputs);
00154
return new_data;
00155 }
00156 VMat reduceDataSetSize(
real fraction,
VMat data)
00157 {
00158
int n_examples=data->
length();
00159
int new_n_examples=(
int)(fraction*n_examples);
00160
return data.
subMatRows(0,new_n_examples);
00161 }
00162
00163
00164 void remapClassnums(
VMat& data,
real remap_minval_to,
real remap_maxval_to)
00165 {
00166
00167
int inputsize = data.
width()-1;
00168
for(
int i=0; i<data.
length(); i++)
00169 {
00170
if(data(i,inputsize)<=0.0)
00171 data->put(i,inputsize,remap_minval_to);
00172
else
00173 data->put(i,inputsize,remap_maxval_to);
00174 }
00175 }
00176
// Root directory of the machine-learning databases used by all loaders below.
// Taken from the DBDIR compile-time define when available; otherwise empty,
// which makes every data path relative to the current working directory.
#ifdef DBDIR
const static string dbdir_name = DBDIR;
#else
const static string dbdir_name = "";
#endif
00181
00182 VMat loadBreastCancerWisconsin(
bool normalize,
bool uniq)
00183 {
00184
Mat data;
00185
if(uniq)
00186
loadAscii(
dbdir_name+
"/Breast/breast-cancer-wisconsin-uniq.amat",data);
00187
else
00188
loadAscii(
dbdir_name+
"/Breast/breast-cancer-wisconsin.amat",data);
00189
if(
normalize)
00190 {
00191
Mat datainput = data.
subMatColumns(0,data.
width()-1);
00192
normalizeDataSet(datainput);
00193 }
00194
shuffleRows(data);
00195
return VMat(data);
00196 }
00197
00198 int loadBreastCancer(
VMat& training_set,
VMat& validation_set,
VMat& test_set,
int ntrain,
int nvalid,
bool uniq)
00199 {
00200
Mat data;
00201
if(uniq)
00202
loadAscii(
dbdir_name +
"/Breast/breast-cancer-wisconsin-uniq.amat",data);
00203
else
00204
loadAscii(
dbdir_name +
"/Breast/breast-cancer-wisconsin.amat",data);
00205
00206
shuffleRows(data);
00207
00208
00209
int ntest = data.
length()-(ntrain+nvalid);
00210
Mat training_data = data.
subMatRows(0,ntrain);
00211
Mat validation_data = data.
subMatRows(ntrain, nvalid);
00212
Mat test_data = data.
subMatRows(ntrain+nvalid,ntest);
00213
00214
00215
normalizeDataSets(training_data,validation_data,test_data);
00216
00217 training_set =
VMat(training_data);
00218 validation_set = VMat(validation_data);
00219 test_set = VMat(test_data);
00220
return 2;
00221 }
00222
00223 VMat loadPimaIndians(
bool normalize)
00224 {
00225
Mat data =
loadUCIMLDB(
dbdir_name +
"/UCI_MLDB/pima-indians-diabetes/pima-indians-diabetes.data");
00226
if(
normalize)
00227 {
00228
Mat datainput = data.
subMatColumns(0,data.
width()-1);
00229
normalizeDataSet(datainput);
00230 }
00231
shuffleRows(data);
00232
return VMat(data);
00233 }
00234
00235 VMat loadHousing(
bool normalize)
00236 {
00237
Mat data;
00238
loadGnuplot(
dbdir_name +
"/UCI_MLDB/housing/housing.data", data);
00239
Mat inputs = data.
subMatColumns(0,13);
00240
Mat targets = data.
subMatColumns(13,1);
00241
if (
normalize)
00242 {
00243
00244
normalizeDataSet(inputs);
00245
00246 targets *=
real(0.01);
00247 }
00248
return VMat(data);
00249 }
00250
00251 VMat loadSonar()
00252 {
00253
Mat data =
loadUCIMLDB(
dbdir_name +
"/UCI_MLDB/undocumented/connectionist-bench/sonar/sonar.all-data");
00254
shuffleRows(data);
00255
00256
return VMat(data);
00257 }
00258
00259 VMat loadIonosphere()
00260 {
00261
Mat data =
loadUCIMLDB(
dbdir_name +
"/UCI_MLDB/ionosphere/ionosphere.data");
00262
shuffleRows(data);
00263
00264
return VMat(data);
00265 }
00266
00267 VMat loadDiabetes(
bool normalize)
00268 {
00269
Mat data;
00270
loadAscii(
dbdir_name +
"/Diabetes/diabetes.amat",data);
00271
00272
if(
normalize)
00273 {
00274
Mat datainput = data.
subMatColumns(0,data.
width()-1);
00275
normalizeDataSet(datainput);
00276 }
00277
shuffleRows(data);
00278
return VMat(data);
00279 }
00280
00281 int loadDiabetes(
VMat& training_set,
VMat& validation_set,
VMat& test_set,
int ntrain,
int nvalid)
00282 {
00283
Mat data;
00284
loadAscii(
dbdir_name +
"/Diabetes/diabetes.amat",data);
00285
00286
shuffleRows(data);
00287
00288
00289
int ntest = data.
length()-(ntrain+nvalid);
00290
Mat training_data = data.
subMatRows(0,ntrain);
00291
Mat validation_data = data.
subMatRows(ntrain, nvalid);
00292
Mat test_data = data.
subMatRows(ntrain+nvalid,ntest);
00293
00294
00295
normalizeDataSets(training_data,validation_data,test_data);
00296
00297 training_set =
VMat(training_data);
00298 validation_set = VMat(validation_data);
00299 test_set = VMat(test_data);
00300
return 2;
00301 }
00302
//! Loads the ATT800 dataset and builds an augmented representation: per-row
//! totals of the 12 duration columns and of the 24 daytime columns are added,
//! and both groups are renormalized to sum to 1 within each row.
//! Returns the number of classes (2).
int loadATT800(VMat& training_set, VMat& test_set)
{
    Mat data;
    loadAscii(dbdir_name + "/ATT800/att800.amat",data);
    // Original layout: 12 duration cols, 24 daytime cols, class in col 36.
    Mat durations = data.subMatColumns(0,12);
    Mat daytimes = data.subMatColumns(12,24);
    Mat classnums = data.column(36);
    // New layout (width+2): col 0 = total duration, cols 1-12 = durations,
    // col 13 = total daytime, cols 14-37 = daytimes, col 38 = class.
    Mat newdata(data.length(), data.width()+2);
    Mat new_total_durations = newdata.column(0);
    Mat new_durations = newdata.subMatColumns(1,12);
    Mat new_total_daytimes = newdata.column(13);
    Mat new_daytimes = newdata.subMatColumns(14,24);
    Mat new_classnums = newdata.column(38);
    new_durations << durations;
    new_daytimes << daytimes;
    new_classnums << classnums;
    for(int i=0; i<data.length(); i++)
    {
        // Total of the duration group, then normalize the group to sum to 1
        // (rows with a zero total are left untouched to avoid dividing by 0).
        new_total_durations(i,0) = sum(new_durations(i));
        if(new_total_durations(i,0) > 0.0)
        {
            Vec new_durations_i = new_durations(i);
            new_durations_i /= new_total_durations(i,0);
        }
        // Same treatment for the daytime group.
        new_total_daytimes(i,0) = sum(new_daytimes(i));
        if(new_total_daytimes(i,0) > 0.0)
        {
            Vec new_daytimes_i = new_daytimes(i);
            new_daytimes_i /= new_total_daytimes(i,0);
        }
    }
    shuffleRows(newdata);
    // NOTE(review): the training block is rows [0,400) but the test block is
    // rows [100,285), so rows 100-284 belong to BOTH sets -- this looks like
    // a bug (test start should probably be 400); confirm before changing.
    Mat training_data = newdata.subMatRows(0,400);
    Mat test_data = newdata.subMatRows(100,185);
    normalizeDataSets(training_data,test_data);
    training_set = VMat(training_data);
    test_set = VMat(test_data);
    return 2;
}
00350
00351 VMat loadLetters(
bool normalize)
00352 {
00353
Mat letters;
00354
loadAscii(
dbdir_name +
"/Letter/letter.amat",letters);
00355
00356
if(
normalize)
00357 {
00358
Mat datainput = letters.
subMatColumns(0,letters.
width()-1);
00359
normalizeDataSet(datainput);
00360 }
00361
00362
return VMat(letters);
00363 }
00364
00365
00366 VMat loadLetters(
const char* class0,
const char* class1,
bool normalize)
00367 {
00368
int letter_classnum[26];
00369
for(
int l=0; l<26; l++)
00370 letter_classnum[l] = -1;
00371
for(
unsigned int i=0; i<
strlen(class0); i++)
00372 letter_classnum[class0[i]-
'A'] = 0;
00373
for(
unsigned int i=0; i<
strlen(class1); i++)
00374 letter_classnum[class1[i]-
'A'] = 1;
00375
00376
Mat letters;
00377
loadAscii(
dbdir_name +
"/Letter/letter.amat",letters);
00378
00379
int nkeptsamples = 0;
00380
for(
int i=0; i<letters.
length(); i++)
00381
if(letter_classnum[
int(letters(i,letters.
width()-1))] >= 0)
00382 nkeptsamples++;
00383
00384
Mat keptletters(nkeptsamples, letters.
width());
00385
int n = 0;
00386
for(
int i=0; i<letters.
length(); i++)
00387 {
00388
int classnum = letter_classnum[int(letters(i,letters.
width()-1))];
00389
if(classnum >= 0)
00390 {
00391 keptletters(n) << letters(i);
00392 keptletters(n,keptletters.
width()-1) = classnum;
00393 n++;
00394 }
00395 }
00396
00397
if(
normalize)
00398 {
00399
Mat datainput = keptletters.
subMatColumns(0,keptletters.
width()-1);
00400
normalizeDataSet(datainput);
00401 }
00402
00403
return VMat(keptletters);
00404 }
00405
//! Loads the Letter dataset restricted to the letters listed in
//! 'which_letters' (capitals, e.g. "ABC"), relabels them 0..n-1 in the order
//! given, optionally shuffles, splits into train/validation/test by the given
//! fractions, and standardizes inputs with training-set statistics.
//! Returns the number of classes (strlen(which_letters)).
int loadLetters(VMat& training_set, VMat& validation_set, VMat& test_set,
                char* which_letters, real validation_fraction,
                real test_fraction, bool do_shuffle)
{
    // Map each letter 'A'..'Z' to its class index, or -1 (= discard).
    int letter_classnum[26];
    for(int l=0; l<26; l++)
        letter_classnum[l] = -1;
    int classnum = 0;
    for(unsigned int i=0; i<strlen(which_letters); i++)
        letter_classnum[which_letters[i]-'A'] = classnum++;
    Mat letters;
    loadAscii(dbdir_name + "/Letter/letter.amat",letters);
    // Copy kept rows into keptletters, relabeling the class column.
    Mat keptletters(letters.length(),letters.width());
    int k=0;
    for(int i=0; i<letters.length(); i++)
    {
        int c = letter_classnum[(int)letters(i,letters.width()-1)];
        if(c!=-1)
        {
            keptletters(k) << letters(i);
            keptletters(k,keptletters.width()-1) = c;
            k++;
        }
    }
    // Shrink to the number of rows actually kept.
    keptletters.resize(k,letters.width());
    // Work on a compact copy and release the (possibly larger) buffer.
    letters = keptletters.copy();
    keptletters = Mat();
    if (do_shuffle){
        shuffleRows(letters);
    }
    int nvalid = int((real)letters.length()*validation_fraction);
    int ntest = int((real)letters.length()*test_fraction);
    int ntrain = letters.length()-(nvalid+ntest);
    Mat training_data = letters.subMatRows(0,ntrain);
    Mat validation_data = letters.subMatRows(ntrain, nvalid);
    Mat test_data = letters.subMatRows(ntrain+nvalid,ntest);
    // Standardize inputs using training-set mean/stddev.
    normalizeDataSets(training_data,validation_data,test_data);
    training_set = VMat(training_data);
    validation_set = VMat(validation_data);
    test_set = VMat(test_data);
    return strlen(which_letters);
}
//! Loads the Letter dataset restricted to the first n_letters letters of the
//! alphabet ('A' gets class 0, 'B' class 1, ...), optionally shuffled.
VMat loadLetters(int n_letters, bool do_shuffle)
{
    if (n_letters > 26 || n_letters < 1)
        PLERROR("In loadLetters: alphabet is at most 26 letters (and at least 1 letter)!");
    // Map each letter 'A'..'Z' to its class index, or -1 (= discard).
    int letter_classnum[26];
    for(int l=0; l<26; l++)
        letter_classnum[l] = -1;
    int classnum = 0;
    int letter = 0;
    // Effectively letter_classnum[i] = i for i < n_letters.
    for(int i=0; i<n_letters; i++)
        letter_classnum[letter++] = classnum++;
    Mat letters;
    loadAscii(dbdir_name + "/Letter/letter.amat",letters);
    // Copy kept rows into keptletters, relabeling the class column.
    Mat keptletters(letters.length(),letters.width());
    int k=0;
    for(int i=0; i<letters.length(); i++)
    {
        int c = letter_classnum[(int)letters(i,letters.width()-1)];
        if(c!=-1)
        {
            keptletters(k) << letters(i);
            keptletters(k,keptletters.width()-1) = c;
            k++;
        }
    }
    // Shrink to the number of rows actually kept.
    keptletters.resize(k,letters.width());
    // Work on a compact copy and release the (possibly larger) buffer.
    letters = keptletters.copy();
    keptletters = Mat();
    if (do_shuffle){
        shuffleRows(letters);
    }
    return VMat(letters);
}
00493
00494 int loadLetters(
VMat& training_set,
VMat& validation_set,
VMat& test_set,
int n_letters,
real validation_fraction,
real test_fraction,
bool do_shuffle)
00495 {
00496
VMat letters=
loadLetters(n_letters,do_shuffle);
00497
int nvalid =
int((
real)letters.
length()*validation_fraction);
00498
int ntest = int((
real)letters.
length()*test_fraction);
00499
int ntrain = letters.
length()-(nvalid+ntest);
00500
00501
Mat training_data = letters.
subMatRows(0,ntrain);
00502
Mat validation_data = letters.
subMatRows(ntrain, nvalid);
00503
Mat test_data = letters.
subMatRows(ntrain+nvalid,ntest);
00504
00505
00506
normalizeDataSets(training_data,validation_data,test_data);
00507
00508 training_set =
VMat(training_data);
00509 validation_set = VMat(validation_data);
00510 test_set = VMat(test_data);
00511
return n_letters;
00512 }
00513
//! Loads the raw Corel histogram data for one class into train/valid/test
//! matrices. For each split, a "size" file gives the number of rows and a
//! "histo" file holds the raw values; only every second column of the raw
//! matrix is kept (even-indexed columns).
//! NOTE(review): the train block byte-swaps under #ifdef LITTLEENDIAN while
//! the valid and test blocks swap under #ifdef BIGENDIAN -- these cannot all
//! be right for the same file format; confirm which endianness the histo
//! files use before touching this.
void loadCorelDatamat(int classnum, Mat& train, Mat& valid, Mat& test)
{
    char filename[1000];
    int len;
    int width = 16*16*16*2; // raw row width: 16x16x16 histogram, 2 values per bin

    { // --- training split ---
        sprintf(filename,(dbdir_name + "/Corel/train/size%d").c_str(),classnum);
        ifstream sizein(filename);
        sizein >> len;
        Mat datamat(len, width);
        sprintf(filename,(dbdir_name + "/Corel/train/histo%d").c_str(),classnum);
        ifstream datain(filename);
#ifdef USEFLOAT
        // Raw binary read assumes 4-byte floats on disk.
        datain.read((char*)datamat.data(), len*width*4);
#ifdef LITTLEENDIAN
        reverse_float(datamat.data(), len*width);
#endif
#else
        PLERROR("In loadCorelDatamat USEDOUBLE case not yet implemented correctly");
#endif
        // Keep only the even-indexed columns of the raw matrix.
        train.resize(len,width/2);
        for(int i=0; i<train.length(); i++)
            for(int j=0; j<train.width(); j++)
                train(i,j) = datamat(i,2*j);
    }

    { // --- validation split ---
        sprintf(filename,(dbdir_name + "/Corel/valid/size%d").c_str(),classnum);
        ifstream sizein(filename);
        sizein >> len;
        Mat datamat(len, width);
        sprintf(filename,(dbdir_name + "/Corel/valid/histo%d").c_str(),classnum);
        ifstream datain(filename);
#ifdef USEFLOAT
        datain.read((char*)datamat.data(), len*width*4);
#ifdef BIGENDIAN
        reverse_float(datamat.data(), len*width);
#endif
#else
        PLERROR("In loadCorelDatamat USEDOUBLE case not yet implemented correctly");
#endif
        valid.resize(len,width/2);
        for(int i=0; i<valid.length(); i++)
            for(int j=0; j<valid.width(); j++)
                valid(i,j) = datamat(i,2*j);
    }

    { // --- test split ---
        sprintf(filename,(dbdir_name + "/Corel/test/size%d").c_str(),classnum);
        ifstream sizein(filename);
        sizein >> len;
        Mat datamat(len, width);
        sprintf(filename,(dbdir_name + "/Corel/test/histo%d").c_str(),classnum);
        ifstream datain(filename);
#ifdef USEFLOAT
        datain.read((char*)datamat.data(), len*width*4);
#ifdef BIGENDIAN
        reverse_float(datamat.data(), len*width);
#endif
#else
        PLERROR("In loadCorelDatamat USEDOUBLE case not yet implemented correctly");
#endif
        test.resize(len,width/2);
        for(int i=0; i<test.length(); i++)
            for(int j=0; j<test.width(); j++)
                test(i,j) = datamat(i,2*j);
    }
}
00594
00595 Mat smoothCorelHisto(
Mat& data)
00596 {
00597
Mat res(data.
length(), 7*7*7);
00598
for(
int n=0; n<data.
length(); n++)
00599 {
00600
real* r = res[n];
00601
real* d = data[n];
00602
for(
int i=0; i<7; i++)
00603
for(
int j=0; j<7; j++)
00604
for(
int k=0;
k<7;
k++,r++)
00605 {
00606 *r += 0.15*d[i*2*16*16+j*2*16+
k*2];
00607 *r += 0.35*d[(i*2+1)*16*16+(j*2+1)*16+
k*2+1];
00608 *r += 0.35*d[(i*2+2)*16*16+(j*2+2)*16+
k*2+2];
00609 *r += 0.15*d[(i*2+3)*16*16+(j*2+3)*16+
k*2+3];
00610 }
00611 }
00612
return res;
00613 }
00614
//! Builds binary-classification train/validation/test sets from two Corel
//! classes: rows of 'negative_class' get label 0, rows of 'positive_class'
//! get label 1. Histograms are smoothed (smoothCorelHisto) and each output
//! set is shuffled after the two classes are stacked.
void loadCorel(Mat& training_set, Mat& validation_set, Mat& test_set,
               int negative_class, int positive_class)
{
    Mat trainA, validA, testA; // negative class
    Mat trainB, validB, testB; // positive class
    loadCorelDatamat(negative_class, trainA, validA, testA);
    trainA = smoothCorelHisto(trainA);
    validA = smoothCorelHisto(validA);
    testA = smoothCorelHisto(testA);
    loadCorelDatamat(positive_class, trainB, validB, testB);
    trainB = smoothCorelHisto(trainB);
    validB = smoothCorelHisto(validB);
    testB = smoothCorelHisto(testB);
    int inputsize = trainA.width();

    // Training set: class-A rows on top, class-B rows below, then a final
    // label column (0 for A, 1 for B). The subMat views write into
    // training_set directly.
    training_set.resize(trainA.length()+trainB.length(), inputsize+1);
    Mat trainingAinputs = training_set.subMat(0, 0, trainA.length(), inputsize);
    Mat trainingAclassnums = training_set.subMat(0, inputsize, trainA.length(), 1);
    Mat trainingBinputs = training_set.subMat(trainA.length(), 0, trainB.length(), inputsize);
    Mat trainingBclassnums = training_set.subMat(trainA.length(), inputsize, trainB.length(), 1);
    trainingAinputs << trainA;
    trainingAclassnums.fill(0.0);
    trainingBinputs << trainB;
    trainingBclassnums.fill(1.0);
    shuffleRows(training_set);

    // Same layout for the validation set.
    validation_set.resize(validA.length()+validB.length(), inputsize+1);
    Mat validAinputs = validation_set.subMat(0, 0, validA.length(), inputsize);
    Mat validAclassnums = validation_set.subMat(0, inputsize, validA.length(), 1);
    Mat validBinputs = validation_set.subMat(validA.length(), 0, validB.length(), inputsize);
    Mat validBclassnums = validation_set.subMat(validA.length(), inputsize, validB.length(), 1);
    validAinputs << validA;
    validAclassnums.fill(0.0);
    validBinputs << validB;
    validBclassnums.fill(1.0);
    shuffleRows(validation_set);

    // Same layout for the test set.
    test_set.resize(testA.length()+testB.length(), inputsize+1);
    Mat testAinputs = test_set.subMat(0, 0, testA.length(), inputsize);
    Mat testAclassnums = test_set.subMat(0, inputsize, testA.length(), 1);
    Mat testBinputs = test_set.subMat(testA.length(), 0, testB.length(), inputsize);
    Mat testBclassnums = test_set.subMat(testA.length(), inputsize, testB.length(), 1);
    testAinputs << testA;
    testAclassnums.fill(0.0);
    testBinputs << testB;
    testBclassnums.fill(1.0);
    shuffleRows(test_set);
}
00666
00667 void loadCallxx(
int year,
VMat& d)
00668 {
00669
Mat data;
00670
char filename[1000];
00671 sprintf(filename,(
dbdir_name +
"/Finance/call%d.stc.data").
c_str(),year);
00672
loadAscii(filename, data);
00673 d =
VMat(data);
00674 }
00675
00676
00677 void loadUSPS(
VMat& trainset,
VMat& testset,
bool use_smooth)
00678 {
00679
Mat traininputs;
00680
Mat testinputs;
00681
Mat traindesired;
00682
Mat testdesired;
00683
00684
if(use_smooth)
00685 {
00686 traininputs =
loadSNMat(
dbdir_name +
"/usps/train-patterns-smoo.mat");
00687 testinputs =
loadSNMat(
dbdir_name +
"/usps/test-patterns-smoo.mat");
00688 }
00689
else
00690 {
00691 traininputs =
loadSNMat(
dbdir_name +
"/usps/ocr16-train.mat");
00692 testinputs =
loadSNMat(
dbdir_name +
"/usps/ocr16-test.mat");
00693 }
00694
00695
00696
00697
00698
00699 traindesired =
loadSNMat(
dbdir_name +
"/usps/train-desired.mat");
00700
Mat trainclasses(traininputs.
length(),1);
00701
for(
int i=0; i<traindesired.
length(); i++)
00702 trainclasses(i,0) =
argmax(traindesired(i));
00703
00704 testdesired =
loadSNMat(
dbdir_name +
"/usps/test-desired.mat");
00705
Mat testclasses(testinputs.
length(),1);
00706
for(
int i=0; i<testdesired.
length(); i++)
00707 testclasses(i,0) =
argmax(testdesired(i));
00708
00709 trainset =
hconcat(traininputs,trainclasses);
00710 testset =
hconcat(testinputs,testclasses);
00711 }
00712
00713 VMat loadUSPS(
bool use_smooth)
00714 {
00715
Mat traininputs;
00716
Mat traindesired;
00717
00718
if(use_smooth)
00719 traininputs =
loadSNMat(
dbdir_name +
"/usps/patterns-smoo.mat");
00720
else
00721 traininputs =
loadSNMat(
dbdir_name +
"/usps/ocr16.pat");
00722
00723 traininputs +=
real(1.0);
00724 traininputs /= real(2.0);
00725
00726 traindesired =
loadSNMat(
dbdir_name +
"/usps/desired.mat");
00727
Mat trainclasses(traininputs.
length(),1);
00728
for(
int i=0; i<traindesired.
length(); i++)
00729 trainclasses(i,0) =
argmax(traindesired(i));
00730
00731
Mat trainset =
hconcat(traininputs,trainclasses);
00732
00733
return trainset;
00734 }
00735
00736 void loadLetters(
int& inputsize,
int& nclasses,
VMat& trainset,
VMat& testset)
00737 {
00738
Mat letters;
00739
loadAscii(
dbdir_name +
"/Letter/letter.amat",letters);
00740 inputsize = letters.
width()-1;
00741 nclasses = 26;
00742 trainset =
VMat(letters.
subMatRows(0,16000));
00743 testset = VMat(letters.
subMatRows(16000,4000));
00744 }
00745
//! Dispatches on 'datasetname' to load one of the known classification
//! datasets into trainset/testset (and allset for UCI datasets), filling in
//! inputsize and nclasses. A suffix ":N" truncates train and test to their
//! first N rows. When normalizeinputs is true, inputs are standardized with
//! training-set statistics (unless loadUCI already normalized and cleared
//! the flag -- see the UCI branch).
void loadClassificationDataset(const string& datasetname, int& inputsize,
                               int& nclasses, VMat& trainset, VMat& testset,
                               bool normalizeinputs, VMat& allset)
{
    string dbname = datasetname;
    int reduced_size = 0;
    // Optional ":N" suffix requests a size-reduced version of the dataset.
    vector<string> dataset_and_size = split(dbname,":");
    if(dataset_and_size.size()==2)
    {
        dbname = dataset_and_size[0];
        reduced_size = toint(dataset_and_size[1]);
    }
    if(dbname=="2d")
    {
        trainset = input2dSet();
        // Remap class labels: -1 -> 0, 1 -> 1.
        Mat mapping(2,2); mapping << "-1 0 1 1";
        trainset = remapLastColumn(trainset,mapping);
        testset = trainset; // train and test are the same set here
        inputsize = 2;
        nclasses = 2;
    }
    else if(dbname=="letters")
    {
        loadLetters(inputsize, nclasses, trainset, testset);
    }
    else if(dbname=="breast")
    {
        // NOTE(review): this local VMat shadows the string 'dbname' above;
        // harmless here, but worth renaming.
        VMat dbname = loadBreastCancerWisconsin();
        inputsize = dbname.width()-1;
        nclasses = 2;
        split(dbname,0.5,trainset,testset);
    }
    else if(dbname=="usps")
    {
        loadUSPS(trainset,testset,true);
        inputsize = trainset.width()-1;
        nclasses = 10;
    }
    else if(dbname=="mnist")
    {
        loadMNIST(trainset,testset);
        inputsize = trainset.width()-1;
        nclasses = 10;
    }
    else if(dbname=="mnist_override")
    {
        // Like "mnist", but the training set is replaced by a local pmat.
        loadMNIST(trainset,testset);
        inputsize = trainset.width()-1;
        nclasses = 10;
        Mat m;
        m.load("mnist_override.pmat");
        // NOTE(review): the message below has typos ("overrid", "espected")
        // but is a runtime string, left untouched here.
        if(m.width() != inputsize+1)
            PLERROR("mnist_overrid.pmat is espected to have a width of %d, but has %d",inputsize+1,m.width());
        trainset = VMat(m);
    }
    // "uspsD" with D in 0..9: one-vs-rest binary problem for digit D.
    else if(dbname.length()==5 && dbname.substr(0,4)=="usps" && dbname[4]>='0' && dbname[4]<='9')
    {
        int classnum = dbname[4]-'0';
        loadUSPS(trainset,testset,true);
        inputsize = trainset.width()-1;
        trainset = remapLastColumn(trainset,classnum,1,0);
        testset = remapLastColumn(testset,classnum,1,0);
        nclasses = 2;
    }
    // "mnistD" with D in 0..9: one-vs-rest binary problem for digit D.
    else if(dbname.length()==5 && dbname.substr(0,4)=="mnist" && dbname[4]>='0' && dbname[4]<='9')
    {
        int classnum = dbname[4]-'0';
        loadMNIST(trainset,testset);
        inputsize = trainset.width()-1;
        trainset = remapLastColumn(trainset,classnum,1.,0.);
        testset = remapLastColumn(testset,classnum,1.,0.);
        nclasses = 2;
    }
    // "UCI_<dir>" or "UCI_KDD_<dir>", with an optional "_ID=<id>" suffix
    // selecting a specific .plearn specification file.
    else if (dbname.substr(0,4) == "UCI_") {
        string db_spec;
        string type;
        if (dbname.substr(0,8) == "UCI_KDD_") {
            db_spec = dbname.substr(8);
            type = "KDD";
        } else {
            db_spec = dbname.substr(4);
            type = "MLDB";
        }
        size_t look_for_id = db_spec.rfind("_ID=");
        string db_dir;
        string id = "";
        if (look_for_id != string::npos) {
            db_dir = db_spec.substr(0, look_for_id);
            id = db_spec.substr(look_for_id + 4);
        } else {
            db_dir = db_spec;
        }
        // loadUCI may normalize itself and then reset normalizeinputs to
        // false (passed by reference), skipping the generic block below.
        loadUCI(trainset, testset, allset, db_dir, id, normalizeinputs,type);
    }
    else
        PLERROR("Unknown dbname %s",dbname.c_str());

    if(reduced_size)
    {
        trainset = trainset.subMatRows(0,reduced_size);
        testset = testset.subMatRows(0,reduced_size);
    }

    if(normalizeinputs)
    {
        // Standardize inputs with training-set statistics; constant columns
        // (stddev 0) are left unscaled by forcing their stddev to 1.
        Vec meanvec;
        Vec stddevvec;
        computeMeanAndStddev(trainset, meanvec, stddevvec);
        meanvec = meanvec.subVec(0,inputsize);
        stddevvec = stddevvec.subVec(0,inputsize);
        for (int i = 0; i < stddevvec.length(); i++) {
            if (stddevvec[i] == 0) {
                stddevvec[i] = 1;
            }
        }
        // NOTE(review): this second loop repeats the guard above and is
        // redundant.
        for (int i=0;i<inputsize;i++)
            if (stddevvec[i]==0) stddevvec[i]=1;
        trainset = normalize(trainset,meanvec,stddevvec);
        testset = normalize(testset,meanvec,stddevvec);
    }
}
00870
00871
00873
//! Loads a UCI (MLDB or KDD) dataset described by a .plearn UCISpecification
//! script found in DBDIR. Fills trainset/testset from the files named in the
//! spec and allset either from the spec's "all" file or by concatenating
//! train and test.
//! If 'normalize' is true on entry, the input columns are normalized here
//! (via ShiftAndRescaleVMatrix) and 'normalize' is reset to FALSE so the
//! caller does not normalize a second time.
void loadUCI(VMat& trainset, VMat& testset, VMat& allset, string db_spec,
             string id, bool &normalize, const string& type) {
    // Spec file name: "<db_spec>[_ID=<id>].plearn".
    string script_file = db_spec;
    if (id != "") {
        script_file += "_ID=" + id;
    }
    script_file += ".plearn";
    string db_dir;
    if (type=="MLDB") {
        db_dir = dbdir_name + "/UCI_MLDB/" + db_spec;
    } else if (type=="KDD") {
        db_dir = dbdir_name + "/UCI_KDD/" + db_spec;
    } else {
        PLERROR("In loadUCI: Unknown dataset type: %s.",type.c_str());
    }
    Object* obj = PLearn::macroLoadObject(db_dir + "/" + script_file);
    PP<UCISpecification> uci_spec = static_cast<UCISpecification*>(obj);
    // Each of the three files is optional and may be stored either in raw
    // "UCI" format or in PLearn "AMAT" format.
    if (uci_spec->file_train != "") {
        if (uci_spec->format=="UCI") {
            loadUCISet(trainset, db_dir + "/" + uci_spec->file_train, uci_spec);
        } else if (uci_spec->format=="AMAT") {
            loadUCIAMat(trainset,db_dir + "/" + uci_spec->file_train, uci_spec);
        } else {
            PLERROR("In loadUCI: Format '%s' unsupported",uci_spec->format.c_str());
        }
    }
    if (uci_spec->file_test != "") {
        if (uci_spec->format=="UCI") {
            loadUCISet(testset, db_dir + "/" + uci_spec->file_test, uci_spec);
        } else if (uci_spec->format=="AMAT") {
            loadUCIAMat(testset,db_dir + "/" + uci_spec->file_test, uci_spec);
        } else {
            PLERROR("In loadUCI: Format '%s' unsupported",uci_spec->format.c_str());
        }
    }
    if (uci_spec->file_all != "") {
        if (uci_spec->format=="UCI") {
            loadUCISet(allset, db_dir + "/" + uci_spec->file_all, uci_spec);
        } else if (uci_spec->format=="AMAT") {
            loadUCIAMat(allset, db_dir + "/" + uci_spec->file_all, uci_spec);
        } else {
            PLERROR("In loadUCI: Format '%s' unsupported",uci_spec->format.c_str());
        }
    } else {
        allset = vconcat(trainset, testset);
    }
    if (normalize) {
        int is = uci_spec->inputsize;
        if (is == -1)
            is = allset->width() - 1; // default: all but the last column
        // Normalize the whole dataset at once, then write the normalized
        // inputs back into allset, trainset and testset.
        VMat tmp_vmat = new ShiftAndRescaleVMatrix(allset, is, 0, true, 0);
        Mat new_data = tmp_vmat->toMat().subMatColumns(0, is);
        allset->putMat(0, 0, new_data);
        if (trainset && testset) {
            if (allset->length() != trainset->length() + testset->length())
                PLERROR("In loadUCI - The whole dataset should have a length equal to train + test");
            // Train rows come first in allset, then test rows.
            trainset->putMat(0, 0, new_data.subMatRows(0, trainset->length()));
            testset->putMat(0, 0, new_data.subMatRows(trainset->length(), testset->length()));
        } else if (trainset || testset) {
            PLERROR("In loadUCI - There can't be only a train set or only a test set");
        }
        // Tell the caller normalization is already done.
        normalize = false;
    }
}
00939
00940
00941
00943
//! Loads a UCI dataset stored in AMAT format. If the spec says the target
//! columns come first, each row is rotated so the targets end up last, and
//! the per-column string<->real mappings are permuted accordingly.
void loadUCIAMat(VMat& data, string file, PP<UCISpecification> uci_spec)
{
    data = loadAsciiAsVMat(file);
    if (uci_spec->target_is_first) {
        int ts = uci_spec->targetsize;
        if (ts == -1) {
            PLERROR("In loadUCIAMat - We don't know how many columns to move");
        }
        if (uci_spec->weightsize > 0) {
            PLERROR("In loadUCIAMat - Damnit, I don't like weights");
        }
        Vec row;
        Vec target;
        target.resize(ts);
        // Rotate each row left by ts positions: save the first ts values,
        // shift the rest down, then append the saved targets at the end.
        // NOTE(review): the middle copy works on overlapping subVecs; it
        // relies on the copy proceeding front-to-back -- confirm before
        // restructuring.
        for (int i = 0; i < data.length(); i++) {
            row = data(i);
            target << row.subVec(0,ts);
            row.subVec(0, data.width() - ts ) << row.subVec(ts, data.width() - ts);
            row.subVec(data.width() - ts , ts) << target;
            data->putRow(i,row);
        }
        // Apply the same rotation to the string<->real column mappings.
        TVec<map<string,real> > sym;
        int is = data.width()-ts;
        sym.resize(ts);
        for (int i=0;i<ts;i++) {
            sym[i] = data->getStringToRealMapping(i);
        }
        for(int i=0;i<is; i++) {
            data->setStringMapping(i, data->getStringToRealMapping(i+ts));
        }
        for(int i=is;i<is+ts;i++) {
            data->setStringMapping(i,sym[i-is]);
        }
    }
}
00987
00989
//! Loads a UCI dataset in raw UCI format via loadUCIMLDB, wraps it in a VMat
//! with the sizes from the spec, installs the column names (from the file
//! header when present) and the symbol->real string mappings built during
//! parsing. If the spec says the targets come first, rows, names, symbol
//! tables and max-value bookkeeping are all rotated so targets end up last.
void loadUCISet(VMat& data, string file, PP<UCISpecification> uci_spec) {
    // C-style symbol tables produced by loadUCIMLDB: to_symbols[c][s] is the
    // s-th symbol string of column c, to_n_symbols[c] its count; ownership is
    // transferred to us and released with free() at the end.
    char *** to_symbols;
    int * to_n_symbols;
    TVec<int> max_in_col;
    TVec<string> header_columns;
    Mat the_data;
    if (uci_spec->header_exists) {
        the_data = loadUCIMLDB(file, &to_symbols, &to_n_symbols, &max_in_col,&header_columns);
    } else {
        the_data = loadUCIMLDB(file, &to_symbols, &to_n_symbols, &max_in_col);
    }
    if (uci_spec->target_is_first) {
        int ts = uci_spec->targetsize;
        if (ts == -1) {
            PLERROR("In loadUCISet - We don't know how many columns to move");
        }
        if (uci_spec->weightsize > 0) {
            PLERROR("In loadUCISet - Damnit, I don't like weights");
        }
        Vec row;
        Vec target;
        target.resize(ts);
        // Rotate each row left by ts positions (targets moved to the end).
        // NOTE(review): the middle copy works on overlapping subVecs; it
        // relies on the copy proceeding front-to-back -- confirm before
        // restructuring. Rows are views into the_data, so no putRow needed.
        for (int i = 0; i < the_data.length(); i++) {
            row = the_data(i);
            target << row.subVec(0,ts);
            row.subVec(0, the_data.width() - ts ) << row.subVec(ts, the_data.width() - ts);
            row.subVec(the_data.width() - ts , ts) << target;
        }
    }
    data = VMat(the_data);
    data->defineSizes(uci_spec->inputsize, uci_spec->targetsize, uci_spec->weightsize);
    if (uci_spec->header_exists) {
        if (uci_spec->header_fields.size()==0) {
            // Header names map one-to-one onto columns; rotate them like the
            // data if the targets were first.
            if (uci_spec->target_is_first) {
                int ts = uci_spec->targetsize;
                int is = the_data.width()-ts;
                TVec<string> tmp;
                tmp.resize(ts);
                tmp << header_columns.subVec(0,ts);
                header_columns.subVec(0,is) << header_columns.subVec(ts,is);
                header_columns.subVec(is,ts) << tmp;
            }
            data->declareFieldNames(header_columns);
        } else {
            // header_fields gives [first,second] column ranges that consume
            // header names in order; columns outside any range stay unnamed.
            TVec<string> field_names;
            field_names.resize(the_data.width());
            int last = 0;
            int cnt=0;
            for (int i=0; i<uci_spec->header_fields.size(); i++) {
                for (int j=last;j<uci_spec->header_fields[i].first;j++) {
                    field_names[j] = "";
                }
                for (int j=uci_spec->header_fields[i].first;j<=uci_spec->header_fields[i].second;j++) {
                    if (cnt>=header_columns.size()) {
                        PLERROR("In loadUCISet: 'header_fields' setting is incorrect");
                    }
                    field_names[j] = header_columns[cnt++];
                }
                last = uci_spec->header_fields[i].second+1;
            }
            for (int i=last;i<field_names.size();i++) {
                field_names[i] = "";
            }
            // Rotate names like the data if the targets were first.
            if (uci_spec->target_is_first) {
                int ts = uci_spec->targetsize;
                int is = the_data.width()-ts;
                TVec<string> tmp;
                tmp.resize(ts);
                tmp << field_names.subVec(0,ts);
                field_names.subVec(0,is) << field_names.subVec(ts,is);
                field_names.subVec(is,ts) << tmp;
            }
            data->declareFieldNames(field_names);
        }
    }
    if (uci_spec->target_is_first) {
        // Rotate the symbol tables and the per-column max values to follow
        // the column rotation performed on the data above.
        int ts = uci_spec->targetsize;
        int is = the_data.width()-ts;
        TVec<char**> tmp_sym(ts);
        TVec<int> tmp_len(ts);
        for(int i=0;i<ts;i++) {
            tmp_sym[i] = to_symbols[i];
            tmp_len[i] = to_n_symbols[i];
        }
        for (int i=ts;i<is+ts;i++) {
            to_symbols[i-ts] = to_symbols[i];
            to_n_symbols[i-ts] = to_n_symbols[i];
        }
        for(int i=is;i<is+ts;i++) {
            to_symbols[i] = tmp_sym[i-is];
            to_n_symbols[i] = tmp_len[i-is];
        }
        // tmp_len is reused here as scratch for rotating max_in_col.
        tmp_len << max_in_col.subVec(0,ts);
        max_in_col.subVec(0,is) << max_in_col.subVec(ts,is);
        max_in_col.subVec(is,ts) << tmp_len;
    }
    // Register each column's symbols as string mappings, numbered just above
    // the column's maximum numeric value.
    for (int j=0;j<data->width();j++) {
        for (int k=0;k<to_n_symbols[j];k++) {
            data->addStringMapping(j,string(to_symbols[j][k]),real(max_in_col[j]+k+1));
        }
    }
    // Release the malloc'd symbol tables handed over by loadUCIMLDB.
    for (int i=0; i<data->width(); i++)
    {
        for (int j=0; j<to_n_symbols[i]; j++)
            free(to_symbols[i][j]);
        free(to_symbols[i]);
    }
    free(to_symbols);
    free(to_n_symbols);
}
01111
01112 }