Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

MatIO.h

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // PLearn (A C++ Machine Learning Library) 00004 // Copyright (C) 1998 Pascal Vincent 00005 // Copyright (C) 1999-2002 Pascal Vincent, Yoshua Bengio and University of Montreal 00006 // 00007 00008 // Redistribution and use in source and binary forms, with or without 00009 // modification, are permitted provided that the following conditions are met: 00010 // 00011 // 1. Redistributions of source code must retain the above copyright 00012 // notice, this list of conditions and the following disclaimer. 00013 // 00014 // 2. Redistributions in binary form must reproduce the above copyright 00015 // notice, this list of conditions and the following disclaimer in the 00016 // documentation and/or other materials provided with the distribution. 00017 // 00018 // 3. The name of the authors may not be used to endorse or promote 00019 // products derived from this software without specific prior written 00020 // permission. 00021 // 00022 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00023 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00024 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00025 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00026 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00027 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00028 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00029 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00030 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00031 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00032 // 00033 // This file is part of the PLearn library. For more information on the PLearn 00034 // library, go to the PLearn Web site at www.plearn.org 00035 00036 00037 00038 00039 /* ******************************************************* 00040 * $Id: MatIO.h,v 1.21 2004/08/09 16:16:08 tihocan Exp $ 00041 * This file is part of the PLearn library. 00042 ******************************************************* */ 00043 00044 #ifndef MatIO_INC 00045 #define MatIO_INC 00046 00047 #include <plearn/math/TMat.h> 00048 #include "fileutils.h" 00049 #include <stdlib.h> 00050 #include <plearn/base/stringutils.h> 00051 00052 namespace PLearn { 00053 using namespace std; 00054 00055 00058 inline Mat makeMat(int length, int width, const string& values) 00059 { Mat m(length,width); m << values; return m; } 00060 00061 inline Vec makeVec(int length, const string& values) 00062 { Vec v(length); v << values; return v; } 00063 00066 void loadMat(const string& filename, TMat<float>& mat); 00067 void loadMat(const string& filename, TMat<double>& mat); 00068 void loadVec(const string& filename, TVec<float>& vec); 00069 void loadVec(const string& filename, TVec<double>& vec); 00070 00072 void savePVec(const string& filename, const TVec<float>& vec); 00073 void savePVec(const string& filename, const TVec<double>& vec); 00074 void loadPVec(const string& filename, TVec<float>& vec); 00075 void loadPVec(const string& filename, TVec<double>& vec); 00076 void savePMat(const string& filename, const TMat<float>& mat); 00077 void savePMat(const string& filename, const TMat<double>& mat); 00078 void loadPMat(const string& filename, TMat<float>& mat); 00079 void loadPMat(const string& filename, TMat<double>& mat); 00080 00082 00084 template<class T> void loadAscii(const string& filename, TMat<T>& mat, TVec<string>& fieldnames, TVec<map<string,real> >* map_sr = 0); 00085 template<class T> void loadAscii(const string& filename, TMat<T>& mat); 00086 00087 void parseSizeFromRemainingLines(const string& filename, ifstream& in, bool& could_be_old_amat, int& length, int& width); 00088 00089 // norman: added another function to solve the internal compiler error of .NET when using 00090 // default parameter with templates. See old declaration: 00091 //template<class T> void saveAscii(const string& filename, const TMat<T>& mat, 00092 // const TVec<string>& fieldnames = TVec<string>() ); 00093 template<class T> void saveAscii(const string& filename, const TMat<T>& mat, 00094 const TVec<string>& fieldnames); 00095 template<class T> void saveAscii(const string& filename, const TMat<T>& mat); 00096 00098 template<class T> void saveAscii(const string& filename, const TVec<T>& vec); 00099 template<class T> void loadAscii(const string& filename, TVec<T>& vec); 00100 00102 void loadGnuplot(const string& filename, Mat& mat); 00103 void saveGnuplot(const string& filename, const Vec& vec); 00104 void saveGnuplot(const string& filename, const Mat& mat); 00105 00107 // Format readable by matlab 00108 00113 void matlabSave( const string& dir, const string& plot_title, const Vec& data, 00114 const Vec& add_col, const Vec& bounds, string lengend="", bool save_plot=true); 00115 void matlabSave( const string& dir, const string& plot_title, 00116 const Vec& xValues, 00117 const Vec& yValues, const Vec& add_col, const Vec& bounds, string lengend="", bool save_plot=true); 00118 00120 void matlabSave( const string& dir, const string& plot_title, const Mat& data, 00121 const Vec& add_col, const Vec& bounds, TVec<string> legend=TVec<string>(), bool save_plot=true); 00122 00131 void matlabSave( const string& dir, const string& plot_title, 00132 const Vec& xValues, 00133 const Mat& yValues, const Vec& add_col, const Vec& bounds, TVec<string> legend=TVec<string>(), bool save_plot=true); 00135 00137 void loadAsciiWithoutSize(const string& filename, const Vec& vec); 00138 void saveAsciiWithoutSize(const string& filename, const Vec& vec); 00139 void loadAsciiWithoutSize(const string& filename, const Mat& mat); 00140 void saveAsciiWithoutSize(const string& filename, const Mat& mat); 00141 00143 Mat loadSNMat(const string& filename); 00144 Vec loadSNVec(const string& filename); 00145 void saveSNMat(const string& filename, const Mat& mat); 00146 void saveSNVec(const string& filename, const Vec& vec); 00147 00149 Mat loadADMat(const string& filename); 00150 Vec loadADVec(const string& filename); 00151 00167 Mat loadUCIMLDB(const string& filename, char ****to_symbols=0, int **to_n_symbols=0, TVec<int>* max_in_col = 0, TVec<string>* header_columns = 0); 00168 00179 Mat loadSTATLOG(const string& filename, char ****to_symbols=0, int **to_n_symbols=0); 00180 00189 void loadJPEGrgb(const string& jpeg_filename, Mat& rgbmat, int& row_size, int scale = 1); 00190 00191 00192 // Intelligent function that will load a file in almost all ascii formats that ever existed in this lab. 00193 // Additionally, if 'map_sr' is provided, it will fill it with the string -> real mappings encountered. 00194 template<class T> 00195 void loadAscii(const string& filename, TMat<T>& mat, TVec<string>& fieldnames, TVec<map<string,real> >* map_sr = 0) 00196 { 00197 ifstream in(filename.c_str()); 00198 if(!in) 00199 PLERROR("Could not open file %s for reading", filename.c_str()); 00200 00201 int length = -1; 00202 int width = -1; 00203 bool could_be_old_amat=true; // true while there is still a chance that this be an "old" amat format (length and width in first row with no starting ##) 00204 00205 in >> ws; 00206 string line; 00207 00208 while(in.peek()=='#') 00209 { 00210 getline(in, line); 00211 could_be_old_amat = false; 00212 00213 size_t pos=line.find(":"); 00214 if(pos!=string::npos) 00215 { 00216 string sub=line.substr(0,pos); 00217 if(sub=="#size") // we've found the dimension specification line 00218 { 00219 string siz=removeblanks((line.substr(pos)).substr(1)); 00220 vector<string> dim = split(siz," "); 00221 if(dim.size()!=2) PLERROR("I need exactly 2 dimensions for matrix"); 00222 length = toint(dim[0]); 00223 width = toint(dim[1]); 00224 } 00225 else if(sub=="#") // we've found the fieldnames specification line 00226 { 00227 string fnl=line.substr(pos).substr(1); 00228 fieldnames = split(fnl," "); 00229 width=fieldnames.size(); 00230 } 00231 } 00232 in >> ws; 00233 } 00234 00235 if(length==-1) // still looking for size info... 00236 parseSizeFromRemainingLines(filename, in, could_be_old_amat, length, width); 00237 00238 if(length==-1) 00239 PLERROR("In loadAscii: trying to load but couldn't determine file format automatically for %s",filename.c_str()); 00240 00241 if(width != -1 && width != fieldnames.length()) 00242 { 00243 if (fieldnames.length() != 0) 00244 PLWARNING("In loadAscii: Number of fieldnames (%d) and width (%d) mismatch in file %s. " 00245 "Replacing fieldnames by 'Field-0', 'Field-1', ...", 00246 fieldnames.length(), width, filename.c_str()); 00247 fieldnames.resize(width); 00248 for(int i= 0; i < width; ++i) 00249 fieldnames[i]= string("Field-") + tostring(i); 00250 } 00251 00252 // We are now more careful about the possibility of the stream being in a 00253 // bad state. The sequel in.seekg(0); in.clear(); did not seem to do the job. 00254 in.close(); 00255 ifstream loadmat(filename.c_str()); 00256 00257 mat.resize(length,width); 00258 TVec<int> current_map(width); 00259 current_map.fill(1001); // The value of the string mapping we start with. 00260 TVec<T> current_max(width); // The max of the numerical values in each column. 00261 current_max.clear(); 00262 // Initialize the mappings to empty mappings. 00263 if (map_sr) { 00264 map_sr->resize(width); 00265 } 00266 string inp_element; 00267 for(int i=0; i<length; i++) 00268 { 00269 T* mat_i = mat[i]; 00270 skipBlanksAndComments(loadmat); 00271 for(int j=0; j<width; j++) { 00272 // C99 strtod handles NAN's and INF's. 00273 if (loadmat) { 00274 loadmat >> inp_element; 00275 if (pl_isnumber(inp_element)) { 00276 mat_i[j] = strtod(inp_element.c_str(), 0); 00277 if (map_sr) { 00278 T val = mat_i[j]; 00279 // We need to make sure that this number does not conflict 00280 // with a string mapping. 00281 if (val > current_max[j]) 00282 current_max[j] = val; 00283 if (current_max[j] >= current_map[j]) 00284 current_map[j] = int(current_max[j] + 1); 00285 map<string,real>& m = (*map_sr)[j]; 00286 for (map<string,real>::iterator it = m.begin(); it != m.end(); it++) { 00287 if (it->second == val) { 00288 // We're screwed, there is currently a mapping between a string 00289 // and this numeric value. We have to change it. 00290 // We pick either the next string mapping value, or the current 00291 // max in the column (+ 1) if it is larger. 00292 int cur_max_plus_one = int(real(current_max[j]) + 1); 00293 if (cur_max_plus_one > current_map[j]) { 00294 it->second = cur_max_plus_one; 00295 current_map[j] = cur_max_plus_one; 00296 } else 00297 it->second = current_map[j]; 00298 current_map[j]++; 00299 // In addition, we have to modify all previous data, which sucks. 00300 for (int k = 0; k < i; k++) { 00301 if (mat(k, j) == val) 00302 mat(k, j) = it->second; 00303 } 00304 } 00305 } 00306 } 00307 } else { 00308 // This is a string! 00309 if (map_sr) { 00310 // Already encountered ? 00311 map<string,real>& m = (*map_sr)[j]; 00312 map<string,real>::iterator it = m.find(inp_element); 00313 if(it != m.end()) { 00314 // It already exists in the map. 00315 mat_i[j] = it->second; 00316 } else { 00317 // We need to add it. 00318 (*map_sr)[j][inp_element] = current_map[j]; 00319 mat_i[j] = current_map[j]; 00320 current_map[j]++; 00321 } 00322 } else 00323 PLERROR("In loadAscii - You need to provide 'map_sr' if you want to load an ASCII file with strings"); 00324 } 00325 } 00326 if (!loadmat) { 00327 loadmat.clear(); 00328 mat_i[j] = MISSING_VALUE; 00329 } 00330 } 00331 } 00332 } 00333 00340 template<class T> 00341 void loadAsciiSingleBinaryDescriptor(const string& filename, TMat<T>& mat) 00342 { 00343 ifstream in(filename.c_str()); 00344 if(!in) 00345 PLERROR("In loadAsciiSingleBinaryDescriptor: Could not open file %s for reading", filename.c_str()); 00346 00347 int length = -1; 00348 int width = -1; 00349 00350 in >> ws; 00351 string line; 00352 00353 while(in.peek()=='#') 00354 { 00355 getline(in, line); 00356 00357 size_t pos=line.find(":"); 00358 if(pos!=string::npos) 00359 { 00360 string sub=line.substr(0,pos); 00361 if(sub=="#size") // we've found the dimension specification line 00362 { 00363 string siz=removeblanks((line.substr(pos)).substr(1)); 00364 vector<string> dim = split(siz," "); 00365 if(dim.size()!=2) PLERROR("In loadAsciiSingleBinaryDescriptor: I need exactly 2 dimensions for matrix"); 00366 length = toint(dim[0]); 00367 width = toint(dim[1]); 00368 } 00369 } 00370 in >> ws; 00371 } 00372 00373 if(length==-1) // still looking for size info... 00374 { 00375 PLERROR("In loadAsciiSingleBinaryDescriptor: Be nice and specify a width and length"); 00376 } 00377 00378 // We are now more careful about the possibility of the stream being in a 00379 // bad state. 00380 mat.resize(length,width); 00381 string inp_element; 00382 for(int i=0; i<length; i++) 00383 { 00384 T* mat_i = mat[i]; 00385 skipBlanksAndComments(in); 00386 in >> inp_element; // Read the entry name. 00387 in >> inp_element; // Read the binary descriptor. 00388 if (inp_element.length() != (unsigned int) width) { 00389 PLERROR("In loadAsciiSingleBinaryDescriptor, a descriptor isn't the right size"); 00390 } 00391 for(int j=0; j<width; j++) { 00392 // C99 strtod handles NAN's and INF's. 00393 mat_i[j] = strtod(inp_element.substr(j,1).c_str(), 0); 00394 } 00395 } 00396 } 00397 00398 template<class T> 00399 void loadAscii(const string& filename, TVec<T>& vec) 00400 { 00401 ifstream in(filename.c_str()); 00402 if(!in) 00403 PLERROR("In loadAscii could not open file %s for reading",filename.c_str()); 00404 00405 int size = -1; 00406 in >> size; 00407 if (size<0 || size>1e10) 00408 PLERROR("In Vec::loadAscii the file is probably not in the right format: size=%d", size); 00409 vec.resize(size); 00410 typename TVec<T>::iterator it = vec.begin(); 00411 typename TVec<T>::iterator itend = vec.end(); 00412 00413 // We are now more careful about the possibility of the stream being in a 00414 // bad state 00415 string inp_element; 00416 for(; it!=itend; ++it) { 00417 // C99 strtod handles NAN's and INF's. 00418 if (in) { 00419 in >> inp_element; 00420 *it = strtod(inp_element.c_str(), 0); 00421 } 00422 if (!in) { 00423 in.clear(); 00424 *it = MISSING_VALUE; 00425 } 00426 } 00427 } 00428 00429 // norman: very stupid function to solve compiler error of VS .NET (see declaration) 00430 template<class T> 00431 void saveAscii(const string& filename, const TMat<T>& mat) 00432 { 00433 saveAscii(filename, mat, TVec<string>()); 00434 } 00435 00436 template<class T> 00437 void saveAscii(const string& filename, const TMat<T>& mat, const TVec<string>& fieldnames) 00438 { 00439 ofstream out(filename.c_str()); 00440 if (!out) 00441 PLERROR("In saveAscii could not open file %s for writing",filename.c_str()); 00442 00443 out << "#size: "<< mat.length() << ' ' << mat.width() << endl; 00444 out.precision(15); 00445 if(fieldnames.size()>0) 00446 { 00447 out << "#: "; 00448 for(int k=0; k<fieldnames.size(); k++) 00449 //there must not be any space in a field name... 00450 out << space_to_underscore(fieldnames[k]) << ' '; 00451 out << endl; 00452 } 00453 00454 for(int i=0; i<mat.length(); i++) 00455 { 00456 const T* row_i = mat[i]; 00457 for(int j=0; j<mat.width(); j++) 00458 out << row_i[j] << ' '; 00459 out << '\n'; 00460 } 00461 } 00462 00463 template<class T> 00464 void saveAscii(const string& filename, const TVec<T>& vec) 00465 { 00466 ofstream out(filename.c_str()); 00467 if (!out) 00468 PLERROR("In saveAscii: could not open file %s for writing",filename.c_str()); 00469 00470 out << vec.length() << endl; 00471 out.precision(15); 00472 00473 typename TVec<T>::iterator it = vec.begin(); 00474 typename TVec<T>::iterator itend = vec.end(); 00475 for(; it!=itend; ++it) 00476 out << *it << ' '; 00477 out << endl; 00478 } 00479 00480 template<class T> 00481 void loadAscii(const string& filename, TMat<T>& mat) 00482 { 00483 TVec<std::string> fn; 00484 loadAscii(filename,mat,fn); 00485 } 00486 00487 } // end of namespace PLearn 00488 00489 00490 #endif

Generated on Tue Aug 17 15:58:09 2004 for PLearn by doxygen 1.3.7