
databases.cc

// -*- C++ -*-

// PLearn (A C++ Machine Learning Library)
// Copyright (C) 1998 Pascal Vincent
// Copyright (C) 1999,2000 Pascal Vincent, Yoshua Bengio and University of Montreal
//

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//  1. Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//
//  2. Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//
//  3. The name of the authors may not be used to endorse or promote
//     products derived from this software without specific prior written
//     permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
// NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This file is part of the PLearn library. For more information on the PLearn
// library, go to the PLearn Web site at www.plearn.org


/* *******************************************************
 * $Id: databases.cc,v 1.17 2004/08/04 14:10:42 mariusmuja Exp $
 * AUTHORS: Pascal Vincent
 * This file is part of the PLearn library.
 ******************************************************* */
#include "databases.h"
#include <plearn/vmat/ConcatRowsVMatrix.h>
#include "NistDB.h"
#include <plearn/math/random.h>
#include <plearn/vmat/RemapLastColumnVMatrix.h>
#include <plearn/vmat/ShiftAndRescaleVMatrix.h>
#include <plearn/vmat/Splitter.h>
#include <plearn/vmat/VMat_maths.h>

namespace PLearn {
using namespace std;


#define JAVA "java"

Mat input2dSet(const string& filename)
{
    Mat data;
    if(!file_exists(filename))
    {
        // If the file does not exist yet, launch the InputPoints Java tool
        // to let the user create a 2D point set interactively.
        string systemstring = string(JAVA) + " InputPoints " + filename + " -1 1 -1 1";
        system(systemstring.c_str());
    }
    loadAscii(filename, data);
    shuffleRows(data);
    return data;
}

// Normalize training_set, validation_set and test_set according to the mean
// and stddev computed on training_set.
void normalizeDataSets(Mat& training_set, Mat& validation_set, Mat& test_set)
{
    int inputsize = training_set.width()-1;
    Mat training_inputs = training_set.subMatColumns(0,inputsize);
    Vec meanvec(inputsize);
    Vec stddevvec(inputsize);
    computeMeanAndStddev(training_inputs, meanvec, stddevvec);
    training_inputs -= meanvec;
    training_inputs /= stddevvec;
    Mat validation_inputs = validation_set.subMatColumns(0,inputsize);
    validation_inputs -= meanvec;
    validation_inputs /= stddevvec;
    Mat test_inputs = test_set.subMatColumns(0,inputsize);
    test_inputs -= meanvec;
    test_inputs /= stddevvec;
}

// Normalize training_set, validation_set and test_set according to the mean
// and stddev computed on training_set.
void normalizeDataSets(VMat& training_set, VMat& validation_set, VMat& test_set)
{
    int inputsize = training_set.width()-1;
    Mat training_inputs = training_set.subMatColumns(0,inputsize);
    Vec meanvec(inputsize);
    Vec stddevvec(inputsize);
    computeMeanAndStddev(training_inputs, meanvec, stddevvec);
    training_inputs -= meanvec;
    training_inputs /= stddevvec;
    Mat validation_inputs = validation_set.subMatColumns(0,inputsize);
    validation_inputs -= meanvec;
    validation_inputs /= stddevvec;
    Mat test_inputs = test_set.subMatColumns(0,inputsize);
    test_inputs -= meanvec;
    test_inputs /= stddevvec;
}

// Normalize both training_set and test_set according to the mean and stddev
// computed on training_set.
void normalizeDataSets(Mat& training_set, Mat& test_set)
{
    int inputsize = training_set.width()-1;
    Mat training_inputs = training_set.subMatColumns(0,inputsize);
    Vec meanvec(inputsize);
    Vec stddevvec(inputsize);
    computeMeanAndStddev(training_inputs, meanvec, stddevvec);
    training_inputs -= meanvec;
    training_inputs /= stddevvec;
    Mat test_inputs = test_set.subMatColumns(0,inputsize);
    test_inputs -= meanvec;
    test_inputs /= stddevvec;
}

void normalizeDataSet(Mat& m) // subtract mean and divide by stddev (estimated globally)
{
    Vec meanvec(m.width());
    Vec stddevvec(m.width());
    computeMeanAndStddev(m,meanvec,stddevvec);
    m -= meanvec;
    m /= stddevvec;
}
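
// Illustrative sketch (editorial addition, not part of the original file):
// how the three-way normalizeDataSets is meant to be used. The matrices here
// are hypothetical toy data; the last column is the target and is left
// untouched, and the mean/stddev come from the training split only.
static void example_normalizeDataSets()
{
    Mat train(6, 3), valid(2, 3), test(2, 3);   // 2 inputs + 1 target per row
    for (int i = 0; i < train.length(); i++) {
        train(i, 0) = i;        // input 1 (non-constant, so stddev > 0)
        train(i, 1) = 10 - i;   // input 2
        train(i, 2) = i % 2;    // class target
    }
    valid.fill(1.0);
    test.fill(2.0);
    // All three splits are rescaled in place with the training statistics.
    normalizeDataSets(train, valid, test);
}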
void splitTrainValidTest(VMat &data_set, VMat &train_set, VMat &valid_set,
                         real valid_fraction, VMat &test_set, real test_fraction,
                         bool normalize)
{
    int nvalid = int((real)data_set.length()*valid_fraction);
    int ntest = int((real)data_set.length()*test_fraction);
    int ntrain = data_set.length()-(nvalid+ntest);

    train_set = data_set.subMatRows(0,ntrain);
    valid_set = data_set.subMatRows(ntrain, nvalid);
    test_set = data_set.subMatRows(ntrain+nvalid,ntest);
    if (normalize) {
        VMat train_set_inputs = train_set.subMatColumns(0,data_set.width()-1);
        VMat valid_set_inputs = valid_set.subMatColumns(0,data_set.width()-1);
        VMat test_set_inputs = test_set.subMatColumns(0,data_set.width()-1);
        normalizeDataSets(train_set_inputs,valid_set_inputs,test_set_inputs);
    }
}

VMat reduceInputSize(real fraction, VMat data)
{
    int n_inputs = data->width()-1;
    int reduce_n_inputs = (int)(fraction*n_inputs);
    cout << "use " << reduce_n_inputs << " of " << n_inputs << endl;
    // Keep the last reduce_n_inputs input columns, plus the target column.
    VMat new_data = data.subMatColumns(n_inputs-reduce_n_inputs, 1+reduce_n_inputs);
    return new_data;
}

VMat reduceDataSetSize(real fraction, VMat data)
{
    int n_examples = data->length();
    int new_n_examples = (int)(fraction*n_examples);
    return data.subMatRows(0, new_n_examples);
}

// Remaps class numbers in the last column, e.g. from {0,1} to {-1,+1}.
void remapClassnums(VMat& data, real remap_minval_to, real remap_maxval_to)
{
    int inputsize = data.width()-1;
    for(int i=0; i<data.length(); i++)
    {
        if(data(i,inputsize)<=0.0)
            data->put(i,inputsize,remap_minval_to);
        else
            data->put(i,inputsize,remap_maxval_to);
    }
}

#ifdef DBDIR
const static string dbdir_name = DBDIR;
#else
const static string dbdir_name = "";
#endif

VMat loadBreastCancerWisconsin(bool normalize, bool uniq)
{
    Mat data;
    if(uniq)
        loadAscii(dbdir_name + "/Breast/breast-cancer-wisconsin-uniq.amat", data);
    else
        loadAscii(dbdir_name + "/Breast/breast-cancer-wisconsin.amat", data);
    if(normalize)
    {
        Mat datainput = data.subMatColumns(0,data.width()-1);
        normalizeDataSet(datainput);
    }
    shuffleRows(data);
    return VMat(data);
}

int loadBreastCancer(VMat& training_set, VMat& validation_set, VMat& test_set, int ntrain, int nvalid, bool uniq)
{
    Mat data;
    if(uniq)
        loadAscii(dbdir_name + "/Breast/breast-cancer-wisconsin-uniq.amat", data);
    else
        loadAscii(dbdir_name + "/Breast/breast-cancer-wisconsin.amat", data);

    shuffleRows(data);

    // Split the data into training, validation and test sets.
    int ntest = data.length()-(ntrain+nvalid);
    Mat training_data = data.subMatRows(0,ntrain);
    Mat validation_data = data.subMatRows(ntrain, nvalid);
    Mat test_data = data.subMatRows(ntrain+nvalid,ntest);

    // Normalize the inputs.
    normalizeDataSets(training_data,validation_data,test_data);

    training_set = VMat(training_data);
    validation_set = VMat(validation_data);
    test_set = VMat(test_data);
    return 2; // 2 classes
}

VMat loadPimaIndians(bool normalize)
{
    Mat data = loadUCIMLDB(dbdir_name + "/UCI_MLDB/pima-indians-diabetes/pima-indians-diabetes.data");
    if(normalize)
    {
        Mat datainput = data.subMatColumns(0,data.width()-1);
        normalizeDataSet(datainput);
    }
    shuffleRows(data);
    return VMat(data);
}
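
// Illustrative sketch (editorial addition, not part of the original file):
// load a dataset and split it 60/20/20, letting splitTrainValidTest normalize
// the inputs with statistics taken from the training part only.
static void example_splitTrainValidTest()
{
    VMat all = loadBreastCancerWisconsin(false /* normalize later */, false /* uniq */);
    VMat train, valid, test;
    splitTrainValidTest(all, train, valid, 0.2 /* valid_fraction */,
                        test, 0.2 /* test_fraction */, true /* normalize */);
}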
VMat loadHousing(bool normalize)
{
    Mat data;
    loadGnuplot(dbdir_name + "/UCI_MLDB/housing/housing.data", data);
    Mat inputs = data.subMatColumns(0,13);
    Mat targets = data.subMatColumns(13,1);
    if (normalize)
    {
        // Normalize the inputs.
        normalizeDataSet(inputs);
        // Put the targets in a nicer range by dividing by 100.
        targets *= real(0.01);
    }
    return VMat(data);
}

VMat loadSonar()
{
    Mat data = loadUCIMLDB(dbdir_name + "/UCI_MLDB/undocumented/connectionist-bench/sonar/sonar.all-data");
    shuffleRows(data);
    // No need to normalize.
    return VMat(data);
}

VMat loadIonosphere()
{
    Mat data = loadUCIMLDB(dbdir_name + "/UCI_MLDB/ionosphere/ionosphere.data");
    shuffleRows(data);
    // No need to normalize.
    return VMat(data);
}

VMat loadDiabetes(bool normalize)
{
    Mat data;
    loadAscii(dbdir_name + "/Diabetes/diabetes.amat", data);

    if(normalize)
    {
        Mat datainput = data.subMatColumns(0,data.width()-1);
        normalizeDataSet(datainput);
    }
    shuffleRows(data);
    return VMat(data);
}

int loadDiabetes(VMat& training_set, VMat& validation_set, VMat& test_set, int ntrain, int nvalid)
{
    Mat data;
    loadAscii(dbdir_name + "/Diabetes/diabetes.amat", data);

    shuffleRows(data);

    // Split the data into training, validation and test sets.
    int ntest = data.length()-(ntrain+nvalid);
    Mat training_data = data.subMatRows(0,ntrain);
    Mat validation_data = data.subMatRows(ntrain, nvalid);
    Mat test_data = data.subMatRows(ntrain+nvalid,ntest);

    // Normalize the inputs.
    normalizeDataSets(training_data,validation_data,test_data);

    training_set = VMat(training_data);
    validation_set = VMat(validation_data);
    test_set = VMat(test_data);
    return 2; // 2 classes
}
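
// Illustrative sketch (editorial addition, not part of the original file):
// the three-way loadDiabetes shuffles and splits the data itself; the split
// sizes below are illustrative, not canonical. Whatever rows remain after the
// training and validation sets go to the test set.
static void example_loadDiabetes()
{
    VMat train, valid, test;
    int nclasses = loadDiabetes(train, valid, test, 300 /* ntrain */, 100 /* nvalid */);
    (void)nclasses; // always 2 for this dataset
}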
int loadATT800(VMat& training_set, VMat& test_set)
{
    Mat data;
    loadAscii(dbdir_name + "/ATT800/att800.amat", data);

    // Preprocess the data: columns 0-11 are call durations, columns 12-35
    // are times of day, column 36 is the class number.
    Mat durations = data.subMatColumns(0,12);
    Mat daytimes = data.subMatColumns(12,24);
    Mat classnums = data.column(36);

    // The new layout adds a total-durations column (0) and a total-daytimes
    // column (13).
    Mat newdata(data.length(), data.width()+2);
    Mat new_total_durations = newdata.column(0);
    Mat new_durations = newdata.subMatColumns(1,12);
    Mat new_total_daytimes = newdata.column(13);
    Mat new_daytimes = newdata.subMatColumns(14,24);
    Mat new_classnums = newdata.column(38);

    new_durations << durations;
    new_daytimes << daytimes;
    new_classnums << classnums;
    for(int i=0; i<data.length(); i++)
    {
        new_total_durations(i,0) = sum(new_durations(i));
        if(new_total_durations(i,0) > 0.0)
        {
            Vec new_durations_i = new_durations(i);
            new_durations_i /= new_total_durations(i,0);
        }
        new_total_daytimes(i,0) = sum(new_daytimes(i));
        if(new_total_daytimes(i,0) > 0.0)
        {
            Vec new_daytimes_i = new_daytimes(i);
            new_daytimes_i /= new_total_daytimes(i,0);
        }
    }

    shuffleRows(newdata);
    Mat training_data = newdata.subMatRows(0,400);
    // Take the test rows after the training rows, so the two sets do not
    // overlap.
    Mat test_data = newdata.subMatRows(400,185);

    // Normalize the new inputs.
    normalizeDataSets(training_data,test_data);

    training_set = VMat(training_data);
    test_set = VMat(test_data);
    return 2; // 2 classes
}

VMat loadLetters(bool normalize)
{
    Mat letters;
    loadAscii(dbdir_name + "/Letter/letter.amat", letters);

    if(normalize)
    {
        Mat datainput = letters.subMatColumns(0,letters.width()-1);
        normalizeDataSet(datainput);
    }

    return VMat(letters);
}


VMat loadLetters(const char* class0, const char* class1, bool normalize)
{
    // Letters listed in class0 get class number 0, letters in class1 get 1,
    // all other letters are discarded.
    int letter_classnum[26];
    for(int l=0; l<26; l++)
        letter_classnum[l] = -1;
    for(unsigned int i=0; i<strlen(class0); i++)
        letter_classnum[class0[i]-'A'] = 0;
    for(unsigned int i=0; i<strlen(class1); i++)
        letter_classnum[class1[i]-'A'] = 1;

    Mat letters;
    loadAscii(dbdir_name + "/Letter/letter.amat", letters);

    int nkeptsamples = 0;
    for(int i=0; i<letters.length(); i++)
        if(letter_classnum[int(letters(i,letters.width()-1))] >= 0)
            nkeptsamples++;

    Mat keptletters(nkeptsamples, letters.width());
    int n = 0;
    for(int i=0; i<letters.length(); i++)
    {
        int classnum = letter_classnum[int(letters(i,letters.width()-1))];
        if(classnum >= 0)
        {
            keptletters(n) << letters(i);
            keptletters(n,keptletters.width()-1) = classnum;
            n++;
        }
    }

    if(normalize)
    {
        Mat datainput = keptletters.subMatColumns(0,keptletters.width()-1);
        normalizeDataSet(datainput);
    }

    return VMat(keptletters);
}

int loadLetters(VMat& training_set, VMat& validation_set, VMat& test_set, char* which_letters, real validation_fraction, real test_fraction, bool do_shuffle)
{
    int letter_classnum[26];
    for(int l=0; l<26; l++)
        letter_classnum[l] = -1;
    int classnum = 0;
    for(unsigned int i=0; i<strlen(which_letters); i++)
        letter_classnum[which_letters[i]-'A'] = classnum++;

    Mat letters;
    loadAscii(dbdir_name + "/Letter/letter.amat", letters);

    Mat keptletters(letters.length(),letters.width());
    int k=0;
    for(int i=0; i<letters.length(); i++)
    {
        int c = letter_classnum[(int)letters(i,letters.width()-1)];
        if(c!=-1)
        {
            keptletters(k) << letters(i);
            keptletters(k,keptletters.width()-1) = c;
            k++;
        }
    }
    keptletters.resize(k,letters.width());

    letters = keptletters.copy();

    // Free the memory used by keptletters.
    keptletters = Mat();
    if (do_shuffle) {
        shuffleRows(letters);
    }
    int nvalid = int((real)letters.length()*validation_fraction);
    int ntest = int((real)letters.length()*test_fraction);
    int ntrain = letters.length()-(nvalid+ntest);

    Mat training_data = letters.subMatRows(0,ntrain);
    Mat validation_data = letters.subMatRows(ntrain, nvalid);
    Mat test_data = letters.subMatRows(ntrain+nvalid,ntest);

    // Normalize the inputs.
    normalizeDataSets(training_data,validation_data,test_data);

    training_set = VMat(training_data);
    validation_set = VMat(validation_data);
    test_set = VMat(test_data);
    return strlen(which_letters);
}
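
// Illustrative sketch (editorial addition, not part of the original file):
// build a binary A-vs-B problem from the Letter data. Each class string may
// contain several letters, e.g. "AEIOU" versus consonants.
static void example_loadLettersBinary()
{
    VMat ab = loadLetters("A", "B", true /* normalize */);
    // The last column of 'ab' is now 0 for 'A' rows and 1 for 'B' rows.
}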
VMat loadLetters(int n_letters, bool do_shuffle)
{
    if (n_letters > 26 || n_letters < 1)
        PLERROR("In loadLetters: alphabet is at most 26 letters (and at least 1 letter)!");
    // Keep the first n_letters letters of the alphabet, with class numbers
    // 0..n_letters-1.
    int letter_classnum[26];
    for(int l=0; l<26; l++)
        letter_classnum[l] = -1;
    int classnum = 0;
    int letter = 0;
    for(int i=0; i<n_letters; i++)
        letter_classnum[letter++] = classnum++;

    Mat letters;
    loadAscii(dbdir_name + "/Letter/letter.amat", letters);

    Mat keptletters(letters.length(),letters.width());
    int k=0;
    for(int i=0; i<letters.length(); i++)
    {
        int c = letter_classnum[(int)letters(i,letters.width()-1)];
        if(c!=-1)
        {
            keptletters(k) << letters(i);
            keptletters(k,keptletters.width()-1) = c;
            k++;
        }
    }
    keptletters.resize(k,letters.width());

    letters = keptletters.copy();

    // Free the memory used by keptletters.
    keptletters = Mat();
    if (do_shuffle) {
        shuffleRows(letters);
    }
    return VMat(letters);
}

int loadLetters(VMat& training_set, VMat& validation_set, VMat& test_set, int n_letters, real validation_fraction, real test_fraction, bool do_shuffle)
{
    VMat letters = loadLetters(n_letters,do_shuffle);
    int nvalid = int((real)letters.length()*validation_fraction);
    int ntest = int((real)letters.length()*test_fraction);
    int ntrain = letters.length()-(nvalid+ntest);

    Mat training_data = letters.subMatRows(0,ntrain);
    Mat validation_data = letters.subMatRows(ntrain, nvalid);
    Mat test_data = letters.subMatRows(ntrain+nvalid,ntest);

    // Normalize the inputs.
    normalizeDataSets(training_data,validation_data,test_data);

    training_set = VMat(training_data);
    validation_set = VMat(validation_data);
    test_set = VMat(test_data);
    return n_letters;
}
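
// Illustrative sketch (editorial addition, not part of the original file):
// a 10-class problem on the letters 'A'..'J', with an 80/10/10 split
// normalized on the training part.
static void example_loadLettersSplit()
{
    VMat train, valid, test;
    int nclasses = loadLetters(train, valid, test, 10 /* n_letters */,
                               0.1 /* validation_fraction */,
                               0.1 /* test_fraction */, true /* do_shuffle */);
    (void)nclasses; // == 10
}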
void loadCorelDatamat(int classnum, Mat& train, Mat& valid, Mat& test)
{
    char filename[1000];
    int len;
    int width = 16*16*16*2; // 16x16x16 histogram, two interleaved values per bin

    // Load train.
    {
        sprintf(filename,(dbdir_name + "/Corel/train/size%d").c_str(),classnum);
        ifstream sizein(filename);
        sizein >> len;
        Mat datamat(len, width);

        sprintf(filename,(dbdir_name + "/Corel/train/histo%d").c_str(),classnum);
        ifstream datain(filename);
#ifdef USEFLOAT
        datain.read((char*)datamat.data(), len*width*4); // 4 bytes per float
#ifdef LITTLEENDIAN
        reverse_float(datamat.data(), len*width);
#endif
#else
        PLERROR("In loadCorelDatamat USEDOUBLE case not yet implemented correctly");
#endif
        // Now copy only the useful features (every other column).
        train.resize(len,width/2);
        for(int i=0; i<train.length(); i++)
            for(int j=0; j<train.width(); j++)
                train(i,j) = datamat(i,2*j);
    }

    // Load valid.
    {
        sprintf(filename,(dbdir_name + "/Corel/valid/size%d").c_str(),classnum);
        ifstream sizein(filename);
        sizein >> len;
        Mat datamat(len, width);

        sprintf(filename,(dbdir_name + "/Corel/valid/histo%d").c_str(),classnum);
        ifstream datain(filename);
#ifdef USEFLOAT
        datain.read((char*)datamat.data(), len*width*4); // 4 bytes per float
#ifdef LITTLEENDIAN
        reverse_float(datamat.data(), len*width);
#endif
#else
        PLERROR("In loadCorelDatamat USEDOUBLE case not yet implemented correctly");
#endif

        // Now copy only the useful features (every other column).
        valid.resize(len,width/2);
        for(int i=0; i<valid.length(); i++)
            for(int j=0; j<valid.width(); j++)
                valid(i,j) = datamat(i,2*j);
    }

    // Load test.
    {
        sprintf(filename,(dbdir_name + "/Corel/test/size%d").c_str(),classnum);
        ifstream sizein(filename);
        sizein >> len;
        Mat datamat(len, width);

        sprintf(filename,(dbdir_name + "/Corel/test/histo%d").c_str(),classnum);
        ifstream datain(filename);
#ifdef USEFLOAT
        datain.read((char*)datamat.data(), len*width*4); // 4 bytes per float
#ifdef LITTLEENDIAN
        reverse_float(datamat.data(), len*width);
#endif
#else
        PLERROR("In loadCorelDatamat USEDOUBLE case not yet implemented correctly");
#endif

        // Now copy only the useful features (every other column).
        test.resize(len,width/2);
        for(int i=0; i<test.length(); i++)
            for(int j=0; j<test.width(); j++)
                test(i,j) = datamat(i,2*j);
    }
}

Mat smoothCorelHisto(Mat& data)
{
    Mat res(data.length(), 7*7*7);
    for(int n=0; n<data.length(); n++)
    {
        real* r = res[n];
        real* d = data[n];
        for(int i=0; i<7; i++)
            for(int j=0; j<7; j++)
                for(int k=0; k<7; k++,r++)
                {
                    // Weighted 4-tap smoothing along each axis of the
                    // 16x16x16 histogram, downsampled to 7x7x7.
                    *r += 0.15*d[i*2*16*16+j*2*16+k*2];
                    *r += 0.35*d[(i*2+1)*16*16+(j*2+1)*16+k*2+1];
                    *r += 0.35*d[(i*2+2)*16*16+(j*2+2)*16+k*2+2];
                    *r += 0.15*d[(i*2+3)*16*16+(j*2+3)*16+k*2+3];
                }
    }
    return res;
}
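
// Illustrative sketch (editorial addition, not part of the original file):
// raw Corel histograms are 16x16x16 with two interleaved values per bin;
// smoothCorelHisto folds each one down to a 7x7x7 summary.
static void example_smoothCorelHisto()
{
    Mat train, valid, test;
    loadCorelDatamat(0 /* classnum */, train, valid, test);
    Mat smoothed = smoothCorelHisto(train);  // width becomes 7*7*7 = 343
    (void)smoothed;
}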
void loadCorel(Mat& training_set, Mat& validation_set, Mat& test_set, int negative_class, int positive_class)
{
    // A is the negative class (will have classnum 0),
    // B is the positive class (will have classnum 1).

    Mat trainA, validA, testA;
    Mat trainB, validB, testB;

    loadCorelDatamat(negative_class, trainA, validA, testA);
    trainA = smoothCorelHisto(trainA);
    validA = smoothCorelHisto(validA);
    testA = smoothCorelHisto(testA);
    loadCorelDatamat(positive_class, trainB, validB, testB);
    trainB = smoothCorelHisto(trainB);
    validB = smoothCorelHisto(validB);
    testB = smoothCorelHisto(testB);
    int inputsize = trainA.width();

    training_set.resize(trainA.length()+trainB.length(), inputsize+1);
    Mat trainingAinputs = training_set.subMat(0, 0, trainA.length(), inputsize);
    Mat trainingAclassnums = training_set.subMat(0, inputsize, trainA.length(), 1);
    Mat trainingBinputs = training_set.subMat(trainA.length(), 0, trainB.length(), inputsize);
    Mat trainingBclassnums = training_set.subMat(trainA.length(), inputsize, trainB.length(), 1);
    trainingAinputs << trainA;
    trainingAclassnums.fill(0.0);
    trainingBinputs << trainB;
    trainingBclassnums.fill(1.0);
    shuffleRows(training_set);

    validation_set.resize(validA.length()+validB.length(), inputsize+1);
    Mat validAinputs = validation_set.subMat(0, 0, validA.length(), inputsize);
    Mat validAclassnums = validation_set.subMat(0, inputsize, validA.length(), 1);
    Mat validBinputs = validation_set.subMat(validA.length(), 0, validB.length(), inputsize);
    Mat validBclassnums = validation_set.subMat(validA.length(), inputsize, validB.length(), 1);
    validAinputs << validA;
    validAclassnums.fill(0.0);
    validBinputs << validB;
    validBclassnums.fill(1.0);
    shuffleRows(validation_set);

    test_set.resize(testA.length()+testB.length(), inputsize+1);
    Mat testAinputs = test_set.subMat(0, 0, testA.length(), inputsize);
    Mat testAclassnums = test_set.subMat(0, inputsize, testA.length(), 1);
    Mat testBinputs = test_set.subMat(testA.length(), 0, testB.length(), inputsize);
    Mat testBclassnums = test_set.subMat(testA.length(), inputsize, testB.length(), 1);
    testAinputs << testA;
    testAclassnums.fill(0.0);
    testBinputs << testB;
    testBclassnums.fill(1.0);
    shuffleRows(test_set);
}

void loadCallxx(int year, VMat& d)
{
    Mat data;
    char filename[1000];
    sprintf(filename,(dbdir_name + "/Finance/call%d.stc.data").c_str(),year);
    loadAscii(filename, data);
    d = VMat(data);
}


void loadUSPS(VMat& trainset, VMat& testset, bool use_smooth)
{
    Mat traininputs;
    Mat testinputs;
    Mat traindesired;
    Mat testdesired;

    if(use_smooth)
    {
        traininputs = loadSNMat(dbdir_name + "/usps/train-patterns-smoo.mat");
        testinputs = loadSNMat(dbdir_name + "/usps/test-patterns-smoo.mat");
    }
    else
    {
        traininputs = loadSNMat(dbdir_name + "/usps/ocr16-train.mat");
        testinputs = loadSNMat(dbdir_name + "/usps/ocr16-test.mat");
    }
    //traininputs += 1.0;
    //traininputs /= 2.0;
    //testinputs += 1.0;
    //testinputs /= 2.0;

    // Turn the "desired outputs" matrices into a single class column by
    // taking the argmax of each row.
    traindesired = loadSNMat(dbdir_name + "/usps/train-desired.mat");
    Mat trainclasses(traininputs.length(),1);
    for(int i=0; i<traindesired.length(); i++)
        trainclasses(i,0) = argmax(traindesired(i));

    testdesired = loadSNMat(dbdir_name + "/usps/test-desired.mat");
    Mat testclasses(testinputs.length(),1);
    for(int i=0; i<testdesired.length(); i++)
        testclasses(i,0) = argmax(testdesired(i));

    trainset = hconcat(traininputs,trainclasses);
    testset = hconcat(testinputs,testclasses);
}

VMat loadUSPS(bool use_smooth)
{
    Mat traininputs;
    Mat traindesired;

    if(use_smooth)
        traininputs = loadSNMat(dbdir_name + "/usps/patterns-smoo.mat");
    else
        traininputs = loadSNMat(dbdir_name + "/usps/ocr16.pat");

    traininputs += real(1.0);
    traininputs /= real(2.0);

    traindesired = loadSNMat(dbdir_name + "/usps/desired.mat");
    Mat trainclasses(traininputs.length(),1);
    for(int i=0; i<traindesired.length(); i++)
        trainclasses(i,0) = argmax(traindesired(i));

    Mat trainset = hconcat(traininputs,trainclasses);

    return trainset;
}

void loadLetters(int& inputsize, int& nclasses, VMat& trainset, VMat& testset)
{
    Mat letters;
    loadAscii(dbdir_name + "/Letter/letter.amat", letters);
    inputsize = letters.width()-1;
    nclasses = 26;
    trainset = VMat(letters.subMatRows(0,16000));
    testset = VMat(letters.subMatRows(16000,4000));
}
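
// Illustrative sketch (editorial addition, not part of the original file):
// the two-set loadUSPS appends the argmax of the "desired" outputs as the
// class column, so the width is the number of pixels plus one.
static void example_loadUSPS()
{
    VMat train, test;
    loadUSPS(train, test, true /* use_smooth */);
    // train.width() == number of pixels + 1 (class in the last column)
}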
void loadClassificationDataset(const string& datasetname, int& inputsize, int& nclasses, VMat& trainset, VMat& testset, bool normalizeinputs, VMat& allset)
{
    string dbname = datasetname;
    int reduced_size = 0;
    // A trailing ":<size>" suffix truncates both the train and test sets.
    vector<string> dataset_and_size = split(dbname,":");
    if(dataset_and_size.size()==2)
    {
        dbname = dataset_and_size[0];
        reduced_size = toint(dataset_and_size[1]);
    }

    if(dbname=="2d")
    {
        trainset = input2dSet();
        Mat mapping(2,2);
        mapping << "-1 0 1 1";
        trainset = remapLastColumn(trainset,mapping);
        testset = trainset;
        inputsize = 2;
        nclasses = 2;
    }
    else if(dbname=="letters")
    {
        loadLetters(inputsize, nclasses, trainset, testset);
    }
    else if(dbname=="breast")
    {
        VMat data = loadBreastCancerWisconsin();
        inputsize = data.width()-1;
        nclasses = 2;
        split(data,0.5,trainset,testset);
    }
    else if(dbname=="usps")
    {
        loadUSPS(trainset,testset,true);
        inputsize = trainset.width()-1;
        nclasses = 10;
    }
    else if(dbname=="mnist")
    {
        loadMNIST(trainset,testset);
        inputsize = trainset.width()-1;
        nclasses = 10;
    }
    else if(dbname=="mnist_override")
    {
        loadMNIST(trainset,testset);
        inputsize = trainset.width()-1;
        nclasses = 10;
        Mat m;
        m.load("mnist_override.pmat");
        if(m.width() != inputsize+1)
            PLERROR("mnist_override.pmat is expected to have a width of %d, but has %d",inputsize+1,m.width());
        trainset = VMat(m);
    }
    else if(dbname.length()==5 && dbname.substr(0,4)=="usps" && dbname[4]>='0' && dbname[4]<='9')
    {
        // "usps<digit>" means one-vs-rest on that digit.
        int classnum = dbname[4]-'0';
        loadUSPS(trainset,testset,true);
        inputsize = trainset.width()-1;
        trainset = remapLastColumn(trainset,classnum,1,0);
        testset = remapLastColumn(testset,classnum,1,0);
        nclasses = 2;
    }
    else if(dbname.length()==5 && dbname.substr(0,4)=="mnist" && dbname[4]>='0' && dbname[4]<='9')
    {
        // "mnist<digit>" means one-vs-rest on that digit.
        int classnum = dbname[4]-'0';
        loadMNIST(trainset,testset);
        inputsize = trainset.width()-1;
        trainset = remapLastColumn(trainset,classnum,1.,0.);
        testset = remapLastColumn(testset,classnum,1.,0.);
        nclasses = 2;
    }
    else if (dbname.substr(0,4) == "UCI_") {
        string db_spec;
        string type;
        if (dbname.substr(0,8) == "UCI_KDD_") {
            db_spec = dbname.substr(8);
            type = "KDD";
        } else {
            db_spec = dbname.substr(4);
            type = "MLDB";
        }

        size_t look_for_id = db_spec.rfind("_ID=");
        string db_dir;
        string id = "";
        if (look_for_id != string::npos) {
            // There is an ID specified.
            db_dir = db_spec.substr(0, look_for_id);
            id = db_spec.substr(look_for_id + 4);
        } else {
            db_dir = db_spec;
        }
        loadUCI(trainset, testset, allset, db_dir, id, normalizeinputs, type);
    }
    else
        PLERROR("Unknown dbname %s",dbname.c_str());

    if(reduced_size)
    {
        trainset = trainset.subMatRows(0,reduced_size);
        testset = testset.subMatRows(0,reduced_size);
    }

    if(normalizeinputs)
    {
        Vec meanvec;
        Vec stddevvec;
        computeMeanAndStddev(trainset, meanvec, stddevvec);
        meanvec = meanvec.subVec(0,inputsize);
        stddevvec = stddevvec.subVec(0,inputsize);
        for (int i = 0; i < stddevvec.length(); i++) {
            if (stddevvec[i] == 0) {
                // The standard deviation is 0, so the column must be
                // constant. Use 1 instead to avoid nans.
                stddevvec[i] = 1;
            }
        }
        trainset = normalize(trainset,meanvec,stddevvec);
        testset = normalize(testset,meanvec,stddevvec);
    }
}
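
// Illustrative sketch (editorial addition, not part of the original file):
// dataset names may carry a ":<size>" suffix, which truncates both sets,
// e.g. "letters:1000" keeps only the first 1000 rows of each.
static void example_loadClassificationDataset()
{
    int inputsize, nclasses;
    VMat train, test, all;
    loadClassificationDataset("letters:1000", inputsize, nclasses,
                              train, test, true /* normalizeinputs */, all);
}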
/////////////
// loadUCI //
/////////////
void loadUCI(VMat& trainset, VMat& testset, VMat& allset, string db_spec, string id, bool &normalize, const string& type) {
    string script_file = db_spec;
    if (id != "") {
        script_file += "_ID=" + id;
    }
    script_file += ".plearn";
    string db_dir;
    if (type=="MLDB") {
        db_dir = dbdir_name + "/UCI_MLDB/" + db_spec;
    } else if (type=="KDD") {
        db_dir = dbdir_name + "/UCI_KDD/" + db_spec;
    } else {
        PLERROR("In loadUCI: Unknown dataset type: %s.",type.c_str());
    }
    Object* obj = PLearn::macroLoadObject(db_dir + "/" + script_file);
    PP<UCISpecification> uci_spec = static_cast<UCISpecification*>(obj);
    if (uci_spec->file_train != "") {
        if (uci_spec->format=="UCI") {
            loadUCISet(trainset, db_dir + "/" + uci_spec->file_train, uci_spec);
        } else if (uci_spec->format=="AMAT") {
            loadUCIAMat(trainset, db_dir + "/" + uci_spec->file_train, uci_spec);
        } else {
            PLERROR("In loadUCI: Format '%s' unsupported",uci_spec->format.c_str());
        }
    }
    if (uci_spec->file_test != "") {
        if (uci_spec->format=="UCI") {
            loadUCISet(testset, db_dir + "/" + uci_spec->file_test, uci_spec);
        } else if (uci_spec->format=="AMAT") {
            loadUCIAMat(testset, db_dir + "/" + uci_spec->file_test, uci_spec);
        } else {
            PLERROR("In loadUCI: Format '%s' unsupported",uci_spec->format.c_str());
        }
    }
    if (uci_spec->file_all != "") {
        if (uci_spec->format=="UCI") {
            loadUCISet(allset, db_dir + "/" + uci_spec->file_all, uci_spec);
        } else if (uci_spec->format=="AMAT") {
            loadUCIAMat(allset, db_dir + "/" + uci_spec->file_all, uci_spec);
        } else {
            PLERROR("In loadUCI: Format '%s' unsupported",uci_spec->format.c_str());
        }
    } else {
        allset = vconcat(trainset, testset);
    }
    if (normalize) {
        int is = uci_spec->inputsize;
        if (is == -1)
            is = allset->width() - 1;
        VMat tmp_vmat = new ShiftAndRescaleVMatrix(allset, is, 0, true, 0);
        Mat new_data = tmp_vmat->toMat().subMatColumns(0, is);
        allset->putMat(0, 0, new_data);
        if (trainset && testset) {
            if (allset->length() != trainset->length() + testset->length())
                PLERROR("In loadUCI - The whole dataset should have a length equal to train + test");
            trainset->putMat(0, 0, new_data.subMatRows(0, trainset->length()));
            testset->putMat(0, 0, new_data.subMatRows(trainset->length(), testset->length()));
        } else if (trainset || testset) {
            PLERROR("In loadUCI - There can't be only a train set or only a test set");
        }
        // We don't want to normalize again.
        normalize = false;
    }
}
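
// Illustrative sketch (editorial addition, not part of the original file):
// loadUCI reads a UCISpecification script named "<db_spec>[_ID=<id>].plearn"
// under the database directory. "adult" below is a placeholder for whatever
// specs are actually installed.
static void example_loadUCI()
{
    VMat train, test, all;
    bool normalize = true;  // loadUCI resets this to false once it normalizes
    loadUCI(train, test, all, "adult", "" /* no id */, normalize, "MLDB");
}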
/////////////////
// loadUCIAMat //
/////////////////
void loadUCIAMat(VMat& data, string file, PP<UCISpecification> uci_spec)
{
    data = loadAsciiAsVMat(file);

    if (uci_spec->target_is_first) {
        // We need to move the target to the last columns.
        int ts = uci_spec->targetsize;
        if (ts == -1) {
            PLERROR("In loadUCIAMat - We don't know how many columns to move");
        }
        if (uci_spec->weightsize > 0) {
            PLERROR("In loadUCIAMat - weightsize > 0 is not supported");
        }
        Vec row;
        Vec target;

        target.resize(ts);
        for (int i = 0; i < data.length(); i++) {
            row = data(i);
            target << row.subVec(0,ts);
            row.subVec(0, data.width() - ts) << row.subVec(ts, data.width() - ts);
            row.subVec(data.width() - ts, ts) << target;
            data->putRow(i,row);
        }

        // Now move the string-to-real mappings accordingly.
        TVec<map<string,real> > sym;
        int is = data.width()-ts;
        sym.resize(ts);
        for (int i=0; i<ts; i++) {
            sym[i] = data->getStringToRealMapping(i);
        }
        for (int i=0; i<is; i++) {
            data->setStringMapping(i, data->getStringToRealMapping(i+ts));
        }
        for (int i=is; i<is+ts; i++) {
            data->setStringMapping(i, sym[i-is]);
        }
    }
}
////////////////
// loadUCISet //
////////////////
void loadUCISet(VMat& data, string file, PP<UCISpecification> uci_spec) {
    char *** to_symbols;
    int * to_n_symbols;
    TVec<int> max_in_col;
    TVec<string> header_columns;
    Mat the_data;
    if (uci_spec->header_exists) {
        the_data = loadUCIMLDB(file, &to_symbols, &to_n_symbols, &max_in_col, &header_columns);
    } else {
        the_data = loadUCIMLDB(file, &to_symbols, &to_n_symbols, &max_in_col);
    }
    if (uci_spec->target_is_first) {
        // We need to move the target to the last columns.
        int ts = uci_spec->targetsize;
        if (ts == -1) {
            PLERROR("In loadUCISet - We don't know how many columns to move");
        }
        if (uci_spec->weightsize > 0) {
            PLERROR("In loadUCISet - weightsize > 0 is not supported");
        }
        Vec row;
        Vec target;

        target.resize(ts);
        for (int i = 0; i < the_data.length(); i++) {
            row = the_data(i);
            target << row.subVec(0,ts);
            row.subVec(0, the_data.width() - ts) << row.subVec(ts, the_data.width() - ts);
            row.subVec(the_data.width() - ts, ts) << target;
        }
    }
    data = VMat(the_data);
    data->defineSizes(uci_spec->inputsize, uci_spec->targetsize, uci_spec->weightsize);

    if (uci_spec->header_exists) {
        if (uci_spec->header_fields.size()==0) {
            if (uci_spec->target_is_first) {
                int ts = uci_spec->targetsize;
                int is = the_data.width()-ts;
                TVec<string> tmp;
                tmp.resize(ts);
                tmp << header_columns.subVec(0,ts);
                header_columns.subVec(0,is) << header_columns.subVec(ts,is);
                header_columns.subVec(is,ts) << tmp;
            }
            data->declareFieldNames(header_columns);
        } else {
            TVec<string> field_names;
            field_names.resize(the_data.width());
            int last = 0;
            int cnt = 0;
            for (int i=0; i<uci_spec->header_fields.size(); i++) {
                for (int j=last; j<uci_spec->header_fields[i].first; j++) {
                    field_names[j] = "";
                }
                for (int j=uci_spec->header_fields[i].first; j<=uci_spec->header_fields[i].second; j++) {
                    if (cnt>=header_columns.size()) {
                        PLERROR("In loadUCISet: 'header_fields' setting is incorrect");
                    }
                    field_names[j] = header_columns[cnt++];
                }
                last = uci_spec->header_fields[i].second+1;
            }
            for (int i=last; i<field_names.size(); i++) {
                field_names[i] = "";
            }
            if (uci_spec->target_is_first) {
                int ts = uci_spec->targetsize;
                int is = the_data.width()-ts;
                TVec<string> tmp;
                tmp.resize(ts);
                tmp << field_names.subVec(0,ts);
                field_names.subVec(0,is) << field_names.subVec(ts,is);
                field_names.subVec(is,ts) << tmp;
            }
            data->declareFieldNames(field_names);
        }
    }

    // Add symbol mappings.
    if (uci_spec->target_is_first) {
        // The target columns were moved to the end, so rotate the symbol
        // tables and the max-in-column values the same way.
        int ts = uci_spec->targetsize;
        int is = the_data.width()-ts;
        TVec<char**> tmp_sym(ts);
        TVec<int> tmp_len(ts);
        for (int i=0; i<ts; i++) {
            tmp_sym[i] = to_symbols[i];
            tmp_len[i] = to_n_symbols[i];
        }
        for (int i=ts; i<is+ts; i++) {
            to_symbols[i-ts] = to_symbols[i];
            to_n_symbols[i-ts] = to_n_symbols[i];
        }
        for (int i=is; i<is+ts; i++) {
            to_symbols[i] = tmp_sym[i-is];
            to_n_symbols[i] = tmp_len[i-is];
        }

        tmp_len << max_in_col.subVec(0,ts);
        max_in_col.subVec(0,is) << max_in_col.subVec(ts,is);
        max_in_col.subVec(is,ts) << tmp_len;
    }
    for (int j=0; j<data->width(); j++) {
        for (int k=0; k<to_n_symbols[j]; k++) {
            data->addStringMapping(j, string(to_symbols[j][k]), real(max_in_col[j]+k+1));
        }
    }

    // Free the symbols.
    for (int i=0; i<data->width(); i++)
    {
        for (int j=0; j<to_n_symbols[i]; j++)
            free(to_symbols[i][j]);
        free(to_symbols[i]);
    }
    free(to_symbols);
    free(to_n_symbols);
}

} // end of namespace PLearn
