Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

MatIO.cc

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // PLearn (A C++ Machine Learning Library) 00004 // Copyright (C) 1998 Pascal Vincent 00005 // Copyright (C) 1999-2002 Pascal Vincent, Yoshua Bengio and University of Montreal 00006 // 00007 00008 // Redistribution and use in source and binary forms, with or without 00009 // modification, are permitted provided that the following conditions are met: 00010 // 00011 // 1. Redistributions of source code must retain the above copyright 00012 // notice, this list of conditions and the following disclaimer. 00013 // 00014 // 2. Redistributions in binary form must reproduce the above copyright 00015 // notice, this list of conditions and the following disclaimer in the 00016 // documentation and/or other materials provided with the distribution. 00017 // 00018 // 3. The name of the authors may not be used to endorse or promote 00019 // products derived from this software without specific prior written 00020 // permission. 00021 // 00022 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00023 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00024 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00025 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00026 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00027 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00028 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00029 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00030 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00031 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00032 // 00033 // This file is part of the PLearn library. For more information on the PLearn 00034 // library, go to the PLearn Web site at www.plearn.org 00035 00036 00037 /* ******************************************************* 00038 * $Id: MatIO.cc,v 1.15 2004/08/02 21:04:00 mariusmuja Exp $ 00039 * This file is part of the PLearn library. 00040 ******************************************************* */ 00041 00042 #include "MatIO.h" 00043 //#include "stringutils.h" 00044 //#include "fileutils.h" 00045 00046 namespace PLearn { 00047 using namespace std; 00048 00049 00050 /* 00051 A word on the old PLearn native binary file format for matrices and vectors (.pmat .pvec): 00052 00053 The file starts with a 64 bytes ASCII ascii header, directly followed by 00054 the data in binary format (a simple memory dump, 4 bytes per value for 00055 floats and 8 bytes for doubles). The ascii header gives the data type, the 00056 dimensions, and the type of binary representation, as shown in the examples 00057 below. The remaining of the 64 bytes is filled with white spaces (' ') and 00058 a final newline ('\n') The 64th byte must always be a newline, this will be 00059 checked by the loading code. 00060 00061 The data in these files can be memory mapped if desired. Notice that 00062 memory-mapping requires that the architecture has the same memory 00063 representation (big endian or little endian) that is used in the file, as 00064 specified in the header (see below) Otherwise you'll get an error message 00065 telling you of the problem. However traditional loading does not require 00066 this; load will 'translate' the representation if necessary. 00067 00068 Examples of header lines: 00069 00070 A 100 element vector of floats (4bytes each) in little endian binary representation: 00071 VECTOR 100 FLOAT LITTLE_ENDIAN 00072 00073 A 100 element vector of doubles (8 bytes each) in big endian binary representation: 00074 VECTOR 100 double BIG_ENDIAN 00075 00076 A 20x10 matrix of doubles in little endian binary representation 00077 MATRIX 20 10 DOUBLE LITTLE_ENDIAN 00078 00079 A 20x10 matrix of floats in big endian binary representation 00080 MATRIX 20 10 FLOAT BIG_ENDIAN 00081 00082 As a convention, we will use the following file extensions: .pmat and .pvec 00083 (p stands for PLearn) (or possibly .lpmat .bpmat and .lpvec .bpvec if you 00084 wish to keep both a little endian and a big endian version of the matrix in 00085 the same place, to allow memory-mapping on different platforms) 00086 */ 00087 00088 void loadMat(const string& file_name, TMat<float>& mat) 00089 { 00090 string ext = extract_extension(file_name); 00091 // See if we know the extension... 00092 if(ext==".amat") 00093 loadAscii(file_name, mat); 00094 else if (ext==".pmat" || ext==".lpmat" || ext==".bpmat") 00095 loadPMat(file_name,mat); 00096 else // try to guess the format from the header 00097 { 00098 ifstream in(file_name.c_str()); 00099 if(!in) 00100 PLERROR("In loadMat: could not open file %s",file_name.c_str()); 00101 char c = in.get(); 00102 in.close(); 00103 if(c=='M') // it's most likely a .pmat format 00104 loadPMat(file_name,mat); 00105 else 00106 loadAscii(file_name,mat); 00107 } 00108 } 00109 00110 void loadMat(const string& file_name, TMat<double>& mat) 00111 { 00112 string ext = extract_extension(file_name); 00113 // See if we know the extension... 00114 if(ext==".amat") 00115 loadAscii(file_name, mat); 00116 else if (ext==".pmat" || ext==".lpmat" || ext==".bpmat") 00117 loadPMat(file_name,mat); 00118 else // try to guess the format from the header 00119 { 00120 ifstream in(file_name.c_str()); 00121 if(!in) 00122 PLERROR("In loadMat: could not open file %s",file_name.c_str()); 00123 char c = in.get(); 00124 in.close(); 00125 if(c=='M') // it's most likely a .pmat format 00126 loadPMat(file_name,mat); 00127 else 00128 loadAscii(file_name,mat); 00129 } 00130 } 00131 00132 00133 void loadVec(const string& file_name, TVec<float>& vec) 00134 { 00135 const char* filename = file_name.c_str(); 00136 char* suffix = strrchr(filename,'.'); 00137 if (!suffix || strcmp(suffix,".avec")==0) 00138 loadAscii(file_name, vec); 00139 else if (strcmp(suffix,".pvec")==0 || strcmp(suffix,".lpvec")==0 || strcmp(suffix,".bpvec")==0) 00140 loadPVec(file_name,vec); 00141 else 00142 PLERROR("In loadVec: unknown file extension"); 00143 } 00144 00145 void loadVec(const string& file_name, TVec<double>& vec) 00146 { 00147 const char* filename = file_name.c_str(); 00148 char* suffix = strrchr(filename,'.'); 00149 if (!suffix || strcmp(suffix,".avec")==0) 00150 loadAscii(file_name, vec); 00151 else if (strcmp(suffix,".pvec")==0 || strcmp(suffix,".lpvec")==0 || strcmp(suffix,".bpvec")==0) 00152 loadPVec(file_name,vec); 00153 else 00154 PLERROR("In loadVec: unknown file extension"); 00155 } 00156 00157 // Old native PLearn format (.pvec and .pmat) 00158 // DATAFILE_HEADER_LENGTH is 64 and is defined in general.h 00159 00160 void savePVec(const string& filename, const TVec<float>& vec) 00161 { 00162 FILE* f = fopen(filename.c_str(),"wb"); 00163 if (!f) 00164 PLERROR("In savePVec, could not open file %s for writing",filename.c_str()); 00165 00166 char header[DATAFILE_HEADERLENGTH]; 00167 00168 #ifdef LITTLEENDIAN 00169 sprintf(header,"VECTOR %d FLOAT LITTLE_ENDIAN", vec.length()); 00170 #endif 00171 #ifdef BIGENDIAN 00172 sprintf(header,"VECTOR %d FLOAT BIG_ENDIAN", vec.length()); 00173 #endif 00174 00175 // Pad the header with whites and terminate it with '\n' 00176 for(int pos=(int)strlen(header); pos<DATAFILE_HEADERLENGTH; pos++) 00177 header[pos] = ' '; 00178 header[DATAFILE_HEADERLENGTH-1] = '\n'; 00179 00180 // write the header to the file 00181 fwrite(header,DATAFILE_HEADERLENGTH,1,f); 00182 00183 // write the data to the file 00184 if(0 < vec.length()) 00185 { 00186 const float* p = vec.data(); 00187 fwrite(p,sizeof(float),vec.length(),f); 00188 } 00189 00190 fclose(f); 00191 } 00192 00193 void savePVec(const string& filename, const TVec<double>& vec) 00194 { 00195 FILE* f = fopen(filename.c_str(),"wb"); 00196 if (!f) 00197 PLERROR("In savePVec, could not open file %s for writing",filename.c_str()); 00198 00199 char header[DATAFILE_HEADERLENGTH]; 00200 00201 #ifdef LITTLEENDIAN 00202 sprintf(header,"VECTOR %d DOUBLE LITTLE_ENDIAN", vec.length()); 00203 #endif 00204 #ifdef BIGENDIAN 00205 sprintf(header,"VECTOR %d DOUBLE BIG_ENDIAN", vec.length()); 00206 #endif 00207 00208 // Pad the header with whites and terminate it with '\n' 00209 for(int pos=(int)strlen(header); pos<DATAFILE_HEADERLENGTH; pos++) 00210 header[pos] = ' '; 00211 header[DATAFILE_HEADERLENGTH-1] = '\n'; 00212 00213 // write the header to the file 00214 fwrite(header,DATAFILE_HEADERLENGTH,1,f); 00215 00216 // write the data to the file 00217 if(0 < vec.length()) 00218 { 00219 const double* p = vec.data(); 00220 fwrite(p,sizeof(double),vec.length(),f); 00221 } 00222 00223 fclose(f); 00224 } 00225 00226 void loadPVec(const string& filename, TVec<float>& vec) 00227 { 00228 char header[DATAFILE_HEADERLENGTH]; 00229 char matorvec[20]; 00230 char datatype[20]; 00231 char endiantype[20]; 00232 int the_length; 00233 00234 FILE* f = fopen(filename.c_str(),"r"); 00235 if (!f) 00236 PLERROR("In loadPVec, could not open file %s for reading",filename.c_str()); 00237 fread(header,DATAFILE_HEADERLENGTH,1,f); 00238 if(header[DATAFILE_HEADERLENGTH-1]!='\n') 00239 PLERROR("In loadPVec(%s), wrong header for PLearn binary vector format. Please use checkheader (in PLearn/Scripts) to check the file.",filename.c_str()); 00240 sscanf(header,"%s%d%s%s",matorvec,&the_length,datatype,endiantype); 00241 if (strcmp(matorvec,"VECTOR")!=0) 00242 PLERROR("In loadPVec(%s), wrong header for PLearn binary vector format. Please use checkheader (in PLearn/Scripts) to check the file.",filename.c_str()); 00243 00244 vec.resize(the_length); 00245 00246 bool is_file_bigendian = false; 00247 if (strcmp(endiantype,"LITTLE_ENDIAN")==0) 00248 is_file_bigendian = false; 00249 else if (strcmp(endiantype,"BIG_ENDIAN")==0) 00250 is_file_bigendian = true; 00251 else 00252 PLERROR("In loadPVec, wrong header for PLearn binary vector format. Please use checkheader (in PLearn/Scripts) to check the file."); 00253 00254 if (strcmp(datatype,"FLOAT")==0) 00255 { 00256 float* p = vec.data(); 00257 fread_float(f,p,vec.length(),is_file_bigendian); 00258 } 00259 00260 else if (strcmp(datatype,"DOUBLE")==0) 00261 { 00262 double* buffer = new double[vec.length()]; 00263 float* p = vec.data(); 00264 fread_double(f,buffer,vec.length(),is_file_bigendian); 00265 for(int j=0; j<vec.length(); j++) 00266 p[j] = float(buffer[j]); 00267 delete[] buffer; 00268 } 00269 00270 else 00271 PLERROR("In loadPVec, wrong header for PLearn binary vector format. Please use checkheader (in PLearn/Scripts) to check the file."); 00272 00273 fclose(f); 00274 } 00275 00276 void loadPVec(const string& filename, TVec<double>& vec) 00277 { 00278 char header[DATAFILE_HEADERLENGTH]; 00279 char matorvec[20]; 00280 char datatype[20]; 00281 char endiantype[20]; 00282 int the_length; 00283 00284 FILE* f = fopen(filename.c_str(),"r"); 00285 if (!f) 00286 PLERROR("In loadPVec, could not open file %s for reading",filename.c_str()); 00287 fread(header,DATAFILE_HEADERLENGTH,1,f); 00288 if(header[DATAFILE_HEADERLENGTH-1]!='\n') 00289 PLERROR("In loadPVec(%s), wrong header for PLearn binary vector format. Please use checkheader (in PLearn/Scripts) to check the file.",filename.c_str()); 00290 sscanf(header,"%s%d%s%s",matorvec,&the_length,datatype,endiantype); 00291 if (strcmp(matorvec,"VECTOR")!=0) 00292 PLERROR("In loadPVec(%s), wrong header for PLearn binary vector format. Please use checkheader (in PLearn/Scripts) to check the file.",filename.c_str()); 00293 00294 vec.resize(the_length); 00295 00296 bool is_file_bigendian = false; 00297 if (strcmp(endiantype,"LITTLE_ENDIAN")==0) 00298 is_file_bigendian = false; 00299 else if (strcmp(endiantype,"BIG_ENDIAN")==0) 00300 is_file_bigendian = true; 00301 else 00302 PLERROR("In loadPVec, wrong header for PLearn binary vector format. Please use checkheader (in PLearn/Scripts) to check the file."); 00303 00304 if (0 < the_length) 00305 { 00306 if (strcmp(datatype,"FLOAT")==0) 00307 { 00308 float* buffer = new float[vec.length()]; 00309 double* p = vec.data(); 00310 fread_float(f,buffer,vec.length(),is_file_bigendian); 00311 for(int j=0; j<vec.length(); j++) 00312 p[j] = double(buffer[j]); 00313 delete[] buffer; 00314 } 00315 00316 else if (strcmp(datatype,"DOUBLE")==0) 00317 { 00318 double* p = vec.data(); 00319 fread_double(f,p,vec.length(),is_file_bigendian); 00320 } 00321 00322 else 00323 PLERROR("In loadPVec, wrong header for PLearn binary vector format. Please use checkheader (in PLearn/Scripts) to check the file."); 00324 } 00325 fclose(f); 00326 } 00327 00328 00329 void savePMat(const string& filename, const TMat<float>& mat) 00330 { 00331 FILE* f = fopen(filename.c_str(),"wb"); 00332 if (!f) 00333 PLERROR("In savePMat, could not open file %s for writing",filename.c_str()); 00334 00335 char header[DATAFILE_HEADERLENGTH]; 00336 00337 #ifdef LITTLEENDIAN 00338 sprintf(header,"MATRIX %d %d FLOAT LITTLE_ENDIAN", mat.length(), mat.width()); 00339 #endif 00340 #ifdef BIGENDIAN 00341 sprintf(header,"MATRIX %d %d FLOAT BIG_ENDIAN", mat.length(), mat.width()); 00342 #endif 00343 00344 // Pad the header with whites and terminate it with '\n' 00345 for(int pos=(int)strlen(header); pos<DATAFILE_HEADERLENGTH; pos++) 00346 header[pos] = ' '; 00347 header[DATAFILE_HEADERLENGTH-1] = '\n'; 00348 00349 // write the header to the file 00350 fwrite(header,DATAFILE_HEADERLENGTH,1,f); 00351 00352 // write the data to the file 00353 for (int i=0; i<mat.length(); i++) 00354 { 00355 const float* p = mat[i]; 00356 fwrite(p,sizeof(float),mat.width(),f); 00357 } 00358 fclose(f); 00359 } 00360 00361 void savePMat(const string& filename, const TMat<double>& mat) 00362 { 00363 FILE* f = fopen(filename.c_str(),"wb"); 00364 if (!f) 00365 PLERROR("In savePMat, could not open file %s for writing",filename.c_str()); 00366 00367 char header[DATAFILE_HEADERLENGTH]; 00368 00369 #ifdef LITTLEENDIAN 00370 sprintf(header,"MATRIX %d %d DOUBLE LITTLE_ENDIAN", mat.length(), mat.width()); 00371 #endif 00372 #ifdef BIGENDIAN 00373 sprintf(header,"MATRIX %d %d DOUBLE BIG_ENDIAN", mat.length(), mat.width()); 00374 #endif 00375 00376 // Pad the header with whites and terminate it with '\n' 00377 for(int pos=(int)strlen(header); pos<DATAFILE_HEADERLENGTH; pos++) 00378 header[pos] = ' '; 00379 header[DATAFILE_HEADERLENGTH-1] = '\n'; 00380 00381 // write the header to the file 00382 fwrite(header,DATAFILE_HEADERLENGTH,1,f); 00383 00384 // write the data to the file 00385 for (int i=0; i<mat.length(); i++) 00386 { 00387 const double* p = mat[i]; 00388 fwrite(p,sizeof(double),mat.width(),f); 00389 } 00390 fclose(f); 00391 } 00392 00393 void loadPMat(const string& filename, TMat<float>& mat) 00394 { 00395 char header[DATAFILE_HEADERLENGTH]; 00396 char matorvec[20]; 00397 char datatype[20]; 00398 char endiantype[20]; 00399 int the_length; 00400 int the_width; 00401 00402 FILE* f = fopen(filename.c_str(),"r"); 00403 if (!f) 00404 PLERROR("In loadPMat, could not open file %s for reading",filename.c_str()); 00405 fread(header,DATAFILE_HEADERLENGTH,1,f); 00406 if(header[DATAFILE_HEADERLENGTH-1]!='\n') 00407 PLERROR("In loadPMat(%s), wrong header for PLearn binary matrix format. Please use checkheader (in PLearn/Scripts) to check the file.",filename.c_str()); 00408 sscanf(header,"%s%d%d%s%s",matorvec,&the_length,&the_width,datatype,endiantype); 00409 if (strcmp(matorvec,"MATRIX")!=0) 00410 PLERROR("In loadPMat(%s), wrong header for PLearn binary matrix format. Please use checkheader (in PLearn/Scripts) to check the file.",filename.c_str()); 00411 00412 mat.resize(the_length, the_width); 00413 00414 bool is_file_bigendian = true; 00415 if (strcmp(endiantype,"LITTLE_ENDIAN")==0) 00416 is_file_bigendian = false; 00417 else if (strcmp(endiantype,"BIG_ENDIAN")==0) 00418 is_file_bigendian = true; 00419 else 00420 PLERROR("In loadPMat, wrong header for PLearn binary matrix format. Please use checkheader (in PLearn/Scripts) to check the file."); 00421 00422 if (strcmp(datatype,"FLOAT")==0) 00423 { 00424 for (int i=0; i<mat.length(); i++) 00425 { 00426 float* p = mat[i]; 00427 fread_float(f,p,mat.width(),is_file_bigendian); 00428 } 00429 } 00430 else if (strcmp(datatype,"DOUBLE")==0) 00431 { 00432 double* buffer = new double[mat.width()]; 00433 for (int i=0; i<mat.length(); i++) 00434 { 00435 float* p = mat[i]; 00436 fread_double(f,buffer,mat.width(),is_file_bigendian); 00437 for(int j=0; j<mat.width(); j++) 00438 p[j] = float(buffer[j]); 00439 } 00440 delete[] buffer; 00441 } 00442 00443 else 00444 PLERROR("In loadPMat, wrong header for PLearn binary matrix format. Please use checkheader (in PLearn/Scripts) to check the file."); 00445 00446 fclose(f); 00447 } 00448 00449 void loadPMat(const string& filename, TMat<double>& mat) 00450 { 00451 char header[DATAFILE_HEADERLENGTH]; 00452 char matorvec[20]; 00453 char datatype[20]; 00454 char endiantype[20]; 00455 int the_length=0; 00456 int the_width=0; 00457 00458 FILE* f = fopen(filename.c_str(),"r"); 00459 if (!f) 00460 PLERROR("In loadPMat, could not open file %s for reading",filename.c_str()); 00461 fread(header,DATAFILE_HEADERLENGTH,1,f); 00462 if(header[DATAFILE_HEADERLENGTH-1]!='\n') 00463 PLERROR("In loadPMat(%s), wrong header for PLearn binary matrix format. Please use checkheader (in PLearn/Scripts) to check the file.",filename.c_str()); 00464 sscanf(header,"%s%d%d%s%s",matorvec,&the_length,&the_width,datatype,endiantype); 00465 if (strcmp(matorvec,"MATRIX")!=0) 00466 PLERROR("In loadPMat(%s), wrong header for PLearn binary matrix format. Please use checkheader (in PLearn/Scripts) to check the file.",filename.c_str()); 00467 00468 mat.resize(the_length, the_width); 00469 00470 bool is_file_bigendian = true; 00471 if (strcmp(endiantype,"LITTLE_ENDIAN")==0) 00472 is_file_bigendian = false; 00473 else if (strcmp(endiantype,"BIG_ENDIAN")==0) 00474 is_file_bigendian = true; 00475 else 00476 PLERROR("In loadPMat, wrong header for PLearn binary matrix format. Please use checkheader (in PLearn/Scripts) to check the file."); 00477 00478 if (strcmp(datatype,"FLOAT")==0) 00479 { 00480 float* buffer = new float[mat.width()]; 00481 for (int i=0; i<mat.length(); i++) 00482 { 00483 double* p = mat[i]; 00484 fread_float(f,buffer,mat.width(),is_file_bigendian); 00485 for(int j=0; j<mat.width(); j++) 00486 p[j] = double(buffer[j]); 00487 } 00488 delete[] buffer; 00489 } 00490 00491 else if (strcmp(datatype,"DOUBLE")==0) 00492 { 00493 for (int i=0; i<mat.length(); i++) 00494 { 00495 double* p = mat[i]; 00496 fread_double(f,p,mat.width(),is_file_bigendian); 00497 } 00498 } 00499 00500 else 00501 PLERROR("In loadPMat, wrong header for PLearn binary matrix format. Please use checkheader (in PLearn/Scripts) to check the file."); 00502 00503 fclose(f); 00504 } 00505 00506 /* 00507 void newLoadAscii(const string& filename, TMat<double>& mat) 00508 { 00509 ifstream in(filename.c_str()); 00510 if(!in) 00511 PLERROR("Could not open file %s for reading", filename.c_str()); 00512 00513 int length = -1; 00514 int width = -1; 00515 bool could_be_old_amat=true; // true while there is still a chance that this be an "old" amat format (length and width in first row with no starting ##) 00516 int c = in.get(); 00517 while(isspace(c)) 00518 c = in.get(); 00519 00520 if(c=='#') // starts with a comment 00521 { 00522 could_be_old_amat = false; 00523 00524 // If it's followed by another # and only 2 numbers before the end of line, it's a new .amat format 00525 if(in.get()=='#') 00526 { 00527 in >> length; 00528 in >> width; 00529 } 00530 c = in.get(); 00531 while(c==' ' || c=='\t') 00532 c = in.get(); 00533 if(c!='\n' && c!='\r') // it wasn't a new .amat format after all... 00534 length = -1; 00535 } 00536 00537 if(length==-1) // still looking for size info... 00538 { 00539 in.unget(); 00540 string line; 00541 getNextNonBlankLine(in,line); 00542 int nfields1 = split(line).size(); 00543 getNextNonBlankLine(in,line); 00544 int nfields2 = split(line).size(); 00545 if(could_be_old_amat && nfields1==2) // could be an old .amat with first 2 numbers being length width 00546 { 00547 in.seekg(0); 00548 real a, b; 00549 in >> a >> b; 00550 if(real(int(a))==a && real(int(b))==b && a>0 && b>0 && int(b)==nfields2) // it's clearly an old .amat 00551 { 00552 length = int(a); 00553 width = int(b); 00554 } 00555 } 00556 00557 if(length==-1) // still don't know size info... 00558 { 00559 if(nfields1==nfields2) // looks like a plain ascii file 00560 { 00561 length = countNonBlankLinesOfFile(filename); 00562 width = nfields1; 00563 in.seekg(0); 00564 } 00565 } 00566 } 00567 00568 if(length==-1) 00569 PLERROR("In loadAscii: couldn't determine file format automatically"); 00570 00571 mat.resize(length,width); 00572 for(int i=0; i<length; i++) 00573 { 00574 real* mat_i = mat[i]; 00575 skipBlanks(in); 00576 for(int j=0; j<width; j++) 00577 in >> mat_i[j]; 00578 } 00579 00580 } 00581 */ 00582 00583 // Gnuplot format 00584 void saveGnuplot(const string& filename, const Vec& vec) 00585 { 00586 FILE* f=fopen(filename.c_str(),"w"); 00587 if (!f) 00588 PLERROR("In Vec::saveGnuplot, couldn't open %s for writing",filename.c_str()); 00589 00590 real* p = vec.data(); 00591 for (int i=0; i<vec.length(); i++, p++) 00592 fprintf(f,"%e\n", *p); 00593 fclose(f); 00594 } 00595 00596 // Gnuplot format 00597 void saveGnuplot(const string& filename, const Mat& mat) 00598 { 00599 ofstream out(filename.c_str()); 00600 if (!out) 00601 PLERROR("In saveGnuplot, couldn't open %s for writing.",filename.c_str()); 00602 out.flags(ios::left); 00603 for(int i=0; i<mat.length(); i++) 00604 { 00605 const real* m_i = mat[i]; 00606 for(int j=0; j<mat.width(); j++) 00607 out << setw(11) << m_i[j] << ' '; 00608 out << "\n"; 00609 } 00610 out.flush(); 00611 } 00612 00613 void loadGnuplot(const string& filename, Mat& mat) 00614 { 00615 ifstream in(filename.c_str()); 00616 if (!in) 00617 PLERROR("In loadGnuplot, couldn't open %s for reading.",filename.c_str()); 00618 00619 char buf[10000]; 00620 // First pass to count the number of rows and columns 00621 int nrows = 0; 00622 int ncols = 0; 00623 in.getline(buf,sizeof(buf)-1); 00624 while(in) 00625 { 00626 int pos=0; 00627 while(buf[pos]==' ' || buf[pos]=='\t') 00628 pos++; 00629 if(buf[pos]!='#' && buf[pos]!='\n' && buf[pos]!='\r') 00630 { 00631 nrows++; 00632 if(ncols==0) 00633 { 00634 istrstream inputline(buf); 00635 real value; 00636 while(inputline) 00637 { 00638 inputline >> value; 00639 ncols++; 00640 } 00641 ncols--; // correct count 00642 } 00643 } 00644 in.getline(buf,sizeof(buf)-1); 00645 } 00646 in.close(); 00647 mat.resize(nrows,ncols); 00648 in.open(filename.c_str()); 00649 for(int i=0; i<nrows; i++) 00650 { 00651 char firstchar = '#'; 00652 while(firstchar == '#' || firstchar == '\n' || firstchar=='\r') 00653 { 00654 in.getline(buf,sizeof(buf)-1); 00655 int pos=0; 00656 while(buf[pos]==' ' || buf[pos]=='\t') 00657 pos++; 00658 firstchar = buf[pos]; 00659 } 00660 istrstream inputline(buf); 00661 for(int j=0; j<ncols; j++) 00662 inputline >> mat(i,j); 00663 } 00664 in.close(); 00665 } 00666 00667 void matlabSave( const string& dir, const string& plot_title, const Vec& data, 00668 const Vec& add_col, const Vec& bounds, string legend, bool save_plot) 00669 { 00670 Vec bidon; 00671 Mat mat(data.length(), 1); 00672 mat << data; 00673 TVec<string> legd; 00674 if(legend != "") 00675 legd.append(legend); 00676 matlabSave(dir, plot_title, 00677 bidon, 00678 mat, add_col, bounds, legd, save_plot); 00679 } 00680 00681 void matlabSave( const string& dir, const string& plot_title, 00682 const Vec& xValues, 00683 const Vec& yValues, const Vec& add_col, const Vec& bounds, string legend, bool save_plot) 00684 { 00685 Mat mat(yValues.length(), 1); 00686 mat << yValues; 00687 TVec<string> legd; 00688 if(legend != "") 00689 legd.append(legend); 00690 matlabSave(dir, plot_title, 00691 xValues, 00692 mat, add_col, bounds, legd, save_plot); 00693 } 00694 00695 void matlabSave( const string& dir, const string& plot_title, const Mat& data, 00696 const Vec& add_col, const Vec& bounds, TVec<string> legend, bool save_plot) 00697 { 00698 Vec bid; 00699 matlabSave(dir, plot_title, bid, data, add_col, bounds, legend, save_plot); 00700 } 00701 00710 void matlabSave( const string& dir, const string& plot_title, 00711 const Vec& xValues, 00712 const Mat& yValues, const Vec& add_col, const Vec& bounds, TVec<string> legend, bool save_plot) 00713 { 00714 force_mkdir(dir); 00715 string directory = append_slash(abspath(dir)); 00716 force_mkdir(directory+"Images/"); 00717 00718 int w = yValues.width(); 00719 00720 ofstream out; 00721 string vec_fname = directory + plot_title + ".mmat"; 00722 out.open(vec_fname.c_str(), ofstream::out | ofstream::trunc); 00723 00724 real startX = 0.0; 00725 int xLen = xValues.length(); 00726 if(xLen != 0) 00727 { 00728 if(xLen == yValues.length()) 00729 startX = MISSING_VALUE; 00730 else if(xLen == 1) 00731 startX = xValues[0]; 00732 else 00733 PLERROR("matlabSave:\n" 00734 "1) If xValues is empty, the yValues are plotted against the row indices.\n" 00735 "2) If xValues is not empty and its length is not equal to the length of yValues, \n" 00736 "then its length must be one and the value xValues[0] will be the start index for the xValues."); 00737 } 00738 00739 for(int d = 0; d < yValues.length(); d++) 00740 { 00741 if(is_missing(startX)) 00742 out << xValues[d] << "\t"; 00743 else 00744 out << (startX+d) << "\t"; 00745 00746 for(int col=0; col < w; col++) 00747 out << yValues(d, col) << "\t"; 00748 00749 for(int add=0; add < add_col.length(); add++) 00750 out << add_col[add] << "\t"; 00751 out << endl; 00752 } 00753 out.close(); 00754 00755 string m_fname = directory + plot_title + ".m"; 00756 out.open(m_fname.c_str(), ofstream::out | ofstream::trunc); 00757 out << "load " << vec_fname << " -ascii" << endl 00758 << plot_title << "= sortrows(" << plot_title << ")" << endl 00759 << "h = plot(" 00760 //--- X Values 00761 << plot_title << "(:,1), " 00763 //--- Y Values 00764 << plot_title << "(:,2:" << (1+w) << "));" << endl 00766 << "set(h, 'LineWidth', 1.0)" << endl 00767 << "set(gcf, 'Position', [0, 0, 1000, 750])" << endl 00768 << "hold on" << endl; 00769 00770 if(legend.isNotEmpty()) 00771 { 00772 int leg = legend.length(); 00773 int wid = yValues.width(); 00774 if(leg != wid) 00775 { 00776 if(legend[0] == "Numbers") 00777 { 00778 legend.resize(wid); 00779 for(int c=0; c < wid; c++) 00780 legend[c] = tostring(c); 00781 } 00782 else 00783 PLERROR("TimeSeriesAnalysis::matlab_save: legend.length() = %d != %d = yValues.width()", 00784 leg, wid); 00785 } 00786 out << "legend(h"; 00787 for(int l=0; l < leg; l++) 00788 { 00789 legend[l] = underscore_to_space(legend[l]); 00790 out << ", '" << legend[l] << "'"; 00791 } 00792 out << ");" << endl; 00793 } 00794 00795 for(int add=0; add < add_col.length(); add++) 00796 out << "g = plot(" << plot_title 00797 << "(:," << (2+w+add) << "));" 00798 << endl 00799 << "set(g, 'Color', [0.5 0.5 0.5])" << endl; 00800 00801 if(bounds.isNotEmpty()) 00802 out << "xlim([" << bounds[0] << ", " << bounds[1] << "])" << endl 00803 << "ylim([" << bounds[2] << ", " << bounds[3] << "])" << endl; 00804 00805 out << "title('" << underscore_to_space(plot_title) << "')" << endl; 00806 00807 if(save_plot) 00808 out << "print('-dpsc2', '" 00809 << (directory+"Images/") 00810 << plot_title << ".eps')" << endl; 00811 00812 out.close(); 00813 } 00814 00815 // Ascii without size 00816 void saveAsciiWithoutSize(const string& filename, const Vec& vec) 00817 { 00818 FILE *f; 00819 f=fopen(filename.c_str(),"w"); 00820 if (!f) 00821 PLERROR("In Vec::saveAscii: could not open file %s for writing",filename.c_str()); 00822 int i; 00823 char buffer[100]; 00824 real *p= vec.data(); 00825 for (i=0;i<vec.length();i++,p++) 00826 { 00827 pretty_print_number(buffer,*p); 00828 fprintf(f,"%s ",buffer); 00829 } 00830 fclose(f); 00831 } 00832 00833 // Ascii without size (load assumes size is already correctly set) 00834 void loadAsciiWithoutSize(const string& filename, const Vec& vec) 00835 { 00836 FILE *f; 00837 f=fopen(filename.c_str(),"r"); 00838 if (!f) 00839 PLERROR("In Vec::loadAsciiWithoutSize could not open file %s for reading",filename.c_str()); 00840 00841 if (vec.length() < 1) 00842 PLERROR("In Vec::loadAsciiWithoutSize, the size of the vector is not defined yet"); 00843 00844 real* p = vec.data(); 00845 for (int i=0;i<vec.length();i++,p++) 00846 { 00847 #ifdef USEDOUBLE 00848 fscanf(f,"%lf",p); 00849 #else 00850 fscanf(f,"%f",p); 00851 #endif 00852 } 00853 } 00854 00855 void saveAsciiWithoutSize(const string& filename, const Mat& mat) 00856 { 00857 FILE *f; 00858 f=fopen(filename.c_str(),"w"); 00859 if (!f) 00860 PLERROR("In saveAscii, could not open file %s for writing",filename.c_str()); 00861 char buffer[100]; 00862 for(int i=0; i<mat.length(); i++) 00863 { 00864 const real* row_i = mat[i]; 00865 for(int j=0; j<mat.width(); j++) 00866 { 00867 pretty_print_number(buffer,row_i[j]); 00868 fprintf(f,"%s ",buffer); 00869 } 00870 fprintf(f,"\n"); 00871 } 00872 fclose(f); 00873 } 00874 00875 void loadAsciiWithoutSize(const string& filename, const Mat& mat) 00876 { 00877 00878 FILE *f = fopen(filename.c_str(),"r"); 00879 if (!f) 00880 PLERROR("In loadAsciiWithoutSize, could not open file %s for reading.",filename.c_str()); 00881 00882 if (mat.length() < 1 || mat.width() < 1) 00883 PLERROR("In loadAsciiWithoutSize, the size of the matrix is not defined yet"); 00884 00885 for(int i=0; i<mat.length(); i++) 00886 { 00887 real* row_i = mat[i]; 00888 for(int j=0; j<mat.width(); j++) 00889 #ifdef USEDOUBLE 00890 fscanf(f,"%lf",&row_i[j]); 00891 #else 00892 fscanf(f,"%f",&row_i[j]); 00893 #endif 00894 } 00895 } 00896 00897 00898 // Native SN format (Fmat) 00899 void saveSNMat(const string& filename, const Mat& mat) 00900 { 00901 FILE *f=fopen(filename.c_str(),"w"); 00902 int i=0x1e3d4c51L; 00903 int j=0; 00904 fwrite_int(f,&i,1); 00905 i=2; /* number of dimensions = 2 for a matrix */ 00906 fwrite_int(f,&i,1); 00907 int length = mat.length(); 00908 int width = mat.width(); 00909 fwrite_int(f,&length,1); 00910 fwrite_int(f,&width,1); 00911 while (i++ < 3) fwrite_int(f,&j,1); 00912 for (i=0;i<length;i++) 00913 fwrite_float(f,mat[i],width); 00914 fclose(f); 00915 } 00916 00917 Mat loadSNMat(const string& filename) 00918 { 00919 FILE *f; 00920 int d, nd; int i; 00921 int imn; 00922 int length, width; 00923 00924 f=fopen(filename.c_str(),"r"); 00925 if (!f) 00926 PLERROR("In loadFmat, could not open file %s for reading",filename.c_str()); 00927 00928 fread_int(f,&imn,1); 00929 if (imn!= (0x1e3d4c51L)) 00930 PLERROR("in loadFmat, File does not have the right format"); 00931 00932 /* read ndim */ 00933 fread_int(f,&nd,1); 00934 if (nd<0 || nd>5) 00935 PLERROR("In loadFmat, Corrupted file: Bad number of dimensions"); 00936 if (nd != 2) 00937 PLERROR("In loadFmat, ndim is not 2 (not a matrix!)\n"); 00938 00939 /* read dim */ 00940 d=0; 00941 fread_int(f,&length,1); 00942 d++; 00943 fread_int(f,&width,1); 00944 d++; 00945 while (d++ < 3) 00946 fread(&i, sizeof(int), 1, f); 00947 Mat mat(length,width); 00948 for(i=0; i<length; i++) 00949 fread_float(f, mat[i], width); 00950 fclose(f); 00951 return mat; 00952 } 00953 00954 void saveSNVec(const string& filename, const Vec& vec) 00955 { 00956 FILE* f=fopen(filename.c_str(),"w"); 00957 if(!f) 00958 PLERROR("In Vec::loadSNVec could not open file for writing"); 00959 int i=0x1e3d4c51L; 00960 int j=0; 00961 fwrite_int(f,&i,1); 00962 i=1; /* number of dimensions = 1 for a vector */ 00963 fwrite_int(f,&i,1); 00964 int length = vec.length(); 00965 fwrite_int(f,&length,1); 00966 while (i++ < 3) 00967 fwrite_int(f,&j,1); 00968 fwrite_float(f,vec.data(),length); 00969 fclose(f); 00970 } 00971 00972 Vec loadSNVec(const string& filename) 00973 { 00974 FILE *f; 00975 int d, nd; int i; 00976 int imn; 00977 int size; 00978 00979 f=fopen(filename.c_str(),"r"); 00980 if (!f) 00981 PLERROR("In Vec::loadSNVec could not open file %s for reading",filename.c_str()); 00982 00983 fread_int(f,&imn,1); 00984 if (imn!= (0x1e3d4c51L)) 00985 PLERROR("In Vec::loadSNVec, File does not have the right format"); 00986 00987 /* read ndim */ 00988 fread_int(f,&nd,1); 00989 if (nd<0 || nd>5) 00990 PLERROR("In Vec::loadSNVec, Corrupted file: Bad number of dimensions"); 00991 if (nd != 1) 00992 PLERROR("In Vec::loadSNVec, ndim is not 1 (not a vector!)\n"); 00993 00994 /* read dim */ 00995 d=0; 00996 fread_int(f,&i,1); 00997 size=i;d++; 00998 while (d++ < 3) 00999 fread(&i, sizeof(int), 1, f); 01000 01001 Vec vec(size); 01002 fread_float(f,vec.data(),size); 01003 01004 fclose(f); 01005 return vec; 01006 } 01007 01008 01009 // Native AD format 01010 01011 Mat loadADMat(const string& filename) 01012 { 01013 FILE *f = fopen(filename.c_str(),"r"); 01014 if (!f) 01015 PLERROR("In loadADMat, could not open file %s for reading",filename.c_str()); 01016 int the_length, the_width; 01017 int magic = 0x2345; 01018 int SNidx2fltmagic = 0x0D02; 01019 int m; 01020 fread_int(f,&m,1); 01021 if (m != magic && m != SNidx2fltmagic) 01022 PLERROR("In load, magic number is incorrect: %d != %d",m,magic); 01023 fread_int(f,&the_length,1); 01024 fread_int(f,&the_width,1); 01025 Mat mat(the_length,the_width); 01026 fread_float(f,mat.data(),the_length*the_width); 01027 fclose(f); 01028 return mat; 01029 } 01030 01031 Vec loadADVec(const string& filename) 01032 { 01033 FILE* f = fopen(filename.c_str(),"r"); 01034 if (!f) 01035 PLERROR("In Vec::loadADMat could not open file %s for reading",filename.c_str()); 01036 int thesize; 01037 int magic = 0x3456; 01038 int m; 01039 fread_int(f,&m,1); 01040 if (m != magic) 01041 PLERROR("In new_Vec_from_File_FILE, magic number is incorret: %d != %d",m,magic); 01042 fread_int(f,&thesize,1); 01043 Vec vec(thesize); 01044 fread_float(f,vec.data(),thesize); 01045 fclose(f); 01046 return vec; 01047 } 01048 01049 // Used for calling qsort 01050 static int compare_string_pointers(const void *ts1, const void *ts2) 01051 { 01052 return strcmp(*((char **)ts1),*((char **)ts2)); 01053 } 01054 01055 01056 // UCI machine-learning-database format 01057 Mat loadUCIMLDB(const string& filename, char ****to_symbols, int **to_n_symbols, TVec<int>* the_max_in_col, TVec<string>* header_columns) 01058 { 01059 FILE *f = fopen(filename.c_str(),"r"); 01060 int n_rows= -1, n_cols=1, i,j; 01061 char ***symbols; 01062 int *n_symbols; 01063 #define convert_UCIMLDB_BUF_LEN 10000 01064 char buffer[convert_UCIMLDB_BUF_LEN]; 01065 char *cp=buffer; 01066 char *word=buffer; 01067 char *cp2; 01068 real *p; 01069 int line_len; 01070 01071 if (!f) 01072 PLERROR("In loadUCIMLDB, could not open file %s for reading",filename.c_str()); 01073 01074 if((to_symbols && !to_n_symbols) || (!to_symbols && to_n_symbols)) 01075 PLERROR("In loadUCIMLDB, to_symbols and to_nsymbols must both be provided (non-null), or both be 0"); 01076 01077 /* first figure out number of columns and number of rows */ 01078 bool skip_header = false; 01079 if (header_columns) { 01080 skip_header = true; 01081 } 01082 while (!feof(f)) 01083 { 01084 do { 01085 fgets(buffer,convert_UCIMLDB_BUF_LEN,f); 01086 } while (!feof(f) && (strcmp(buffer,"\n")==0 || strncmp(buffer,";;;",3)==0)); 01087 if (skip_header) { 01088 skip_header = false; 01089 } else { 01090 if (n_rows == -1) { 01091 /* read number of columns */ 01092 while ((cp=strchr(cp,','))) 01093 { 01094 cp++; 01095 n_cols++; 01096 } 01097 } 01098 n_rows++; 01099 } 01100 } 01101 01102 fclose(f); 01103 01104 /* figure out the set of symbols used for each symbolic row, if any */ 01105 symbols = (char ***)calloc(n_cols,sizeof(char **)); 01106 n_symbols = (int *)calloc(n_cols,sizeof(int)); 01107 01108 TVec<int>* max_in_col; 01109 if (the_max_in_col) { 01110 max_in_col = the_max_in_col; 01111 } else { 01112 max_in_col = new TVec<int>(); 01113 } 01114 max_in_col->resize(n_cols); 01115 max_in_col->fill(-1); 01116 if(to_symbols) 01117 { 01118 *to_symbols = symbols; 01119 *to_n_symbols = n_symbols; 01120 } 01121 f = fopen(filename.c_str(),"r"); 01122 01123 01124 /* read header columns */ 01125 if (header_columns) { 01126 do { 01127 fgets(buffer,convert_UCIMLDB_BUF_LEN,f); 01128 } while (!feof(f) && (strcmp(buffer,"\n")==0 || strncmp(buffer,";;;",3)==0)); 01129 01130 cp=word=buffer; 01131 01132 while ((cp=strchr(cp,','))) { 01133 *cp=0; 01134 header_columns->append(word); 01135 cp++; 01136 word=cp; 01137 } 01138 header_columns->append(word); 01139 } 01140 01141 for (i=0;i<n_rows;i++) 01142 { 01143 do { 01144 fgets(buffer,convert_UCIMLDB_BUF_LEN,f); 01145 } while (!feof(f) && (strcmp(buffer,"\n")==0 || strncmp(buffer,";;;",3)==0)); 01146 01147 01148 // ignore everything after '|' 01149 char *comm = strchr(buffer,'|'); 01150 if (comm) *comm = '\n'; 01151 01152 line_len=strlen(buffer); 01153 cp=word=buffer; 01154 for (j=0;j<n_cols;j++) 01155 { 01156 /* find next end of word */ 01157 while ((*cp!=',' && *cp!='\n') && cp<=buffer+line_len) cp++; 01158 *cp=0; 01159 /* is this symbolic? */ 01160 cp2=word; 01161 string the_val = word; 01162 01163 if (!pl_isnumber(word) && *cp2!='?') { 01164 /* yes, non-missing symbolic character was found: */ 01165 if (symbols[j]) 01166 { 01167 /* we already had found symbols in this column */ 01168 int w=0; 01169 while (symbols[j][w] && /* look for this symbol */ 01170 strcmp(symbols[j][w],word)!=0 && 01171 w<n_symbols[j]) w++; 01172 if (w==n_rows) 01173 PLERROR("logic error in loadUCIMLDB"); 01174 if (!symbols[j][w]) 01175 { 01176 /* new symbol */ 01177 symbols[j][w] = (char *)calloc(strlen(word)+1,sizeof(char)); 01178 strcpy(symbols[j][w],word); 01179 n_symbols[j]++; 01180 } 01181 } 01182 else 01183 { 01184 /* it's the first time we find a symbol in this column */ 01185 symbols[j] = (char **)calloc(n_rows,sizeof(char *)); 01186 symbols[j][0] = (char *)calloc(strlen(word)+1,sizeof(char)); 01187 strcpy(symbols[j][0], word); 01188 n_symbols[j]=1; 01189 } 01190 } else { 01191 // This is a numerical character: we use it to keep track of the 01192 // maximum in this column. 01193 real real_val; 01194 if (the_val != "?" && pl_isnumber(the_val, &real_val)) { 01195 if (int(real_val) > (*max_in_col)[j]) { 01196 (*max_in_col)[j] = int(real_val); 01197 } 01198 } 01199 } 01200 word = cp+1; 01201 } 01202 } 01203 fclose(f); 01204 01205 /* sort the symbols */ 01206 for (j=0;j<n_cols;j++) 01207 if (symbols[j]) /* it has symbols */ 01208 qsort(symbols[j],n_symbols[j],sizeof(char *),compare_string_pointers); 01209 01210 Mat mat(n_rows,n_cols); 01211 /* NOW actually READ the data */ 01212 { 01213 p = mat.data(); 01214 f = fopen(filename.c_str(),"r"); 01215 01216 // skip one line if header present 01217 if (header_columns) { 01218 do { 01219 fgets(buffer,convert_UCIMLDB_BUF_LEN,f); 01220 } while (!feof(f) && (strcmp(buffer,"\n")==0 || strncmp(buffer,";;;",3)==0)); 01221 } 01222 01223 01224 for (i=0;i<n_rows;i++) 01225 { 01226 /* read a row */ 01227 do { 01228 fgets(buffer,convert_UCIMLDB_BUF_LEN,f); 01229 } while (!feof(f) && (strcmp(buffer,"\n")==0 || strncmp(buffer,";;;",3)==0)); 01230 01231 01232 // ignore everything after '|' 01233 char *comm = strchr(buffer,'|'); 01234 if (comm) *comm = '\n'; 01235 01236 line_len=strlen(buffer); 01237 cp=word=buffer; 01238 /* interpret a row */ 01239 for (j=0;j<n_cols;j++) 01240 { 01241 /* find end of word */ 01242 while ((*cp!=',' && *cp!='\n') && cp<=buffer+line_len) cp++; 01243 *cp=0; 01244 /* is this missing? */ 01245 if (*word == '?') 01246 *p = MISSING_VALUE; 01247 else { 01248 /* is this symbolic? */ 01249 bool is_symbolic = false; 01250 if (symbols[j]) { 01251 /* Try to read symbolic data */ 01252 int w=0; 01253 while (symbols[j][w] && /* look for this symbol */ 01254 strcmp(symbols[j][w],word)!=0 && 01255 w<n_symbols[j]) w++; 01256 if (w != n_rows && symbols[j][w]) { 01257 // The symbol does exist. 01258 is_symbolic = true; 01259 *p = w + (*max_in_col)[j] + 1; 01260 } 01261 } 01262 if (!is_symbolic) { 01263 /* read numeric data */ 01264 #ifdef USEDOUBLE 01265 sscanf(word,"%lf",p); 01266 #else 01267 sscanf(word,"%f",p); 01268 #endif 01269 } 01270 } 01271 word=cp+1; 01272 p++; 01273 } 01274 } 01275 fclose(f); 01276 } 01277 01278 if(!to_symbols) 01279 { 01280 for (int i=0; i<mat.width(); i++) 01281 { 01282 for (int j=0; j<n_symbols[i]; j++) 01283 free(symbols[i][j]); 01284 free(symbols[i]); 01285 } 01286 free(symbols); 01287 free(n_symbols); 01288 } 01289 if (!the_max_in_col) 01290 delete max_in_col; 01291 #undef convert_UCIMLDB_BUF_LEN 01292 01293 return mat; 01294 } 01295 01296 01297 // STATLOG machine-learning-database format 01298 Mat loadSTATLOG(const string& filename, char ****to_symbols, int **to_n_symbols) 01299 { 01300 FILE *f = fopen(filename.c_str(),"r"); 01301 int n_rows= -1, n_cols=0, i,j; 01302 char ***symbols; 01303 int *n_symbols; 01304 #define convert_STATLOG_BUF_LEN 10000 01305 char buffer[convert_STATLOG_BUF_LEN]; 01306 char *cp=buffer; 01307 char *word=buffer; 01308 char *cp2; 01309 real *p; 01310 int line_len; 01311 01312 if (!f) 01313 PLERROR("In loadSTATLOG, could not open file %s for reading",filename.c_str()); 01314 01315 if((to_symbols && !to_n_symbols) || (!to_symbols && to_n_symbols)) 01316 PLERROR("In loadUCIMLDB, to_symbols and to_nsymbols must both be provided (non-null), or both be 0"); 01317 01318 /* first figure out number of columns and number of rows */ 01319 01320 while (!feof(f)) 01321 { 01322 fgets(buffer,convert_STATLOG_BUF_LEN,f); 01323 if (n_rows == -1) 01324 { 01325 /* read number of columns */ 01326 while (*cp == ' ') 01327 cp++; /* jumping over blancs at the start of a new line */ 01328 while ( *cp!=0 && *cp!='\n' ) 01329 { 01330 while ( *cp != 0 && *cp != '\n' && *cp != ' ') 01331 { 01332 cp++; /* read one colomn */ 01333 } 01334 n_cols++; 01335 while ( *cp != 0 && *cp != '\n' && *cp == ' ') 01336 { 01337 cp++; /* jumping over blancs separating columns */ 01338 } 01339 } 01340 } 01341 n_rows++; 01342 } 01343 fclose(f); 01344 01345 01346 /* figure out the set of symbols used for each symbolic row, if any */ 01347 symbols = (char ***)calloc(n_cols,sizeof(char **)); 01348 n_symbols = (int *)calloc(n_cols,sizeof(int)); 01349 if (to_symbols) 01350 { 01351 *to_symbols = symbols; 01352 *to_n_symbols = n_symbols; 01353 } 01354 f = fopen(filename.c_str(),"r"); 01355 for (i=0;i<n_rows;i++) 01356 { 01357 fgets(buffer,convert_STATLOG_BUF_LEN,f); 01358 line_len=strlen(buffer); 01359 cp=word=buffer; 01360 for (j=0;j<n_cols;j++) 01361 { 01362 /* jumping over blancs at the start of a new line */ 01363 while (*cp == ' ') 01364 { 01365 cp++; 01366 word++; 01367 } 01368 /* find next end of word */ 01369 while ((*cp!=' ' && *cp!='\n') && cp<=buffer+line_len) cp++; 01370 *cp=0; 01371 /* is this symbolic? */ 01372 cp2=word; 01373 while (!isalpha((int)*cp2) && *cp2!='?' && cp2 < cp) cp2++; 01374 if (isalpha((int)*cp2) && *cp2!='?') 01375 { 01376 /* yes, non-misisng symbolic character was found: */ 01377 if (symbols[j]) { 01378 /* we already had found symbols in this column */ 01379 int w=0; 01380 while (symbols[j][w] && /* look for this symbol */ 01381 strcmp(symbols[j][w],word)!=0 && 01382 w<n_symbols[j]) w++; 01383 if (w==n_rows) 01384 PLERROR("logic error in loadSTATLOG"); 01385 if (!symbols[j][w]) 01386 { 01387 /* new symbol */ 01388 symbols[j][w] = (char *)calloc(strlen(word)+1,sizeof(char)); 01389 strcpy(symbols[j][w],word); 01390 n_symbols[j]++; 01391 } 01392 } 01393 else 01394 { 01395 /* it's the first time we find a symbol in this column */ 01396 symbols[j] = (char **)calloc(n_rows,sizeof(char *)); 01397 symbols[j][0] = (char *)calloc(strlen(word)+1,sizeof(char)); 01398 strcpy(symbols[j][0], word); 01399 n_symbols[j]=1; 01400 } 01401 } 01402 word = cp+1; 01403 } 01404 } 01405 fclose(f); 01406 01407 /* sort the symbols */ 01408 for (j=0;j<n_cols;j++) 01409 if (symbols[j]) /* it has symbols */ 01410 qsort(symbols[j],n_symbols[j],sizeof(char *),compare_string_pointers); 01411 01412 Mat mat(n_rows, n_cols); 01413 /* NOW actually READ the data */ 01414 { 01415 p = mat.data(); 01416 f = fopen(filename.c_str(),"r"); 01417 for (i=0;i<n_rows;i++) 01418 { 01419 /* read a row */ 01420 fgets(buffer,convert_STATLOG_BUF_LEN,f); 01421 line_len=strlen(buffer); 01422 cp=word=buffer; 01423 /* interpret a row */ 01424 for (j=0;j<n_cols;j++) 01425 { 01426 /* jumping over blancs at the start of a new line */ 01427 while (*cp == ' ') 01428 { 01429 cp++; 01430 word++; 01431 } 01432 /* find end of word */ 01433 while ((*cp!=' ' && *cp!='\n') && cp<=buffer+line_len) cp++; 01434 *cp=0; 01435 /* is this missing? */ 01436 if (*word == '?') 01437 *p = MISSING_VALUE; 01438 else 01439 /* is this symbolic? */ 01440 if (symbols[j]) { 01441 /* read symbolic data */ 01442 int w=0; 01443 while (symbols[j][w] && /* look for this symbol */ 01444 strcmp(symbols[j][w],word)!=0 && 01445 w<n_symbols[j]) w++; 01446 if (w==n_rows || !symbols[j][w]) 01447 PLERROR("logic error in loadSTATLOG"); 01448 *p = w; 01449 } 01450 else 01451 { 01452 /* read numeric data */ 01453 #ifdef USEDOUBLE 01454 sscanf(word,"%lf",p); 01455 #else 01456 sscanf(word,"%f",p); 01457 #endif 01458 } 01459 word=cp+1; 01460 p++; 01461 } 01462 } 01463 fclose(f); 01464 } 01465 01466 if(!to_symbols) 01467 { 01468 for (int i=0; i<mat.width(); i++) 01469 { 01470 for (int j=0; j<n_symbols[i]; j++) 01471 free(symbols[i][j]); 01472 free(symbols[i]); 01473 } 01474 free(symbols); 01475 free(n_symbols); 01476 } 01477 #undef convert_STATLOG_BUF_LEN 01478 01479 return mat; 01480 } 01481 01482 01483 // jpeg stuff... 01484 void loadJPEGrgb(const string& jpeg_filename, Mat& rgbmat, int& row_size, int scale) 01485 { 01486 string tmpfile = jpeg_filename + ".pnm"; 01487 char command[1000]; 01488 sprintf(command,"djpeg -pnm -scale 1/%d %s > %s", 01489 scale,jpeg_filename.c_str(),tmpfile.c_str()); 01490 system(command); 01491 FILE* fp = fopen(tmpfile.c_str(),"r"); 01492 if (!fp) 01493 PLERROR("reading %s",tmpfile.c_str()); 01494 fscanf(fp,"%s",command); 01495 int w,h; 01496 fscanf(fp,"%d %d\n",&w,&h); 01497 fscanf(fp,"%*d\n"); 01498 int n=w*h; 01499 rgbmat.resize(n,3); 01500 real *d=rgbmat.data(); 01501 for (int i=0;i<n;i++) 01502 for (int k=0;k<3;k++,d++) 01503 *d =(real)(getc(fp)); 01504 fclose(fp); 01505 sprintf(command,"rm %s",tmpfile.c_str()); 01506 system(command); 01507 row_size = w; 01508 } 01509 01510 void parseSizeFromRemainingLines(const string& filename, ifstream& in, bool& could_be_old_amat, int& length, int& width) 01511 { 01512 string line; 01513 getNextNonBlankLine(in,line); 01514 if(line=="") // There are no value lines 01515 { 01516 width=length=0; 01517 could_be_old_amat=false; 01518 return; 01519 } 01520 01521 int nfields1 = int(split(line).size()); 01522 getNextNonBlankLine(in,line); 01523 if(line=="") // There is only one line 01524 { 01525 length = 1; 01526 width = nfields1; 01527 could_be_old_amat = false; 01528 return; 01529 } 01530 01531 // The number of lines that seems to contain values 01532 int guesslength = countNonBlankLinesOfFile(filename); 01533 01534 int nfields2 = int(split(line).size()); // The width of the second line. 01535 if(nfields1==nfields2) // looks like a plain ascii file 01536 { 01537 length = guesslength; 01538 width = nfields1; 01539 return; 01540 } 01541 01542 if(!could_be_old_amat || nfields1!=2) 01543 return; // could not be an old .amat with first 2 numbers being length width 01544 01545 // Get to the beggining of the file 01546 in.seekg(0); 01547 in.clear(); 01548 01549 // Reread the first line as to real numbers 01550 real a = -1.0, b = -1.0; 01551 in >> a >> b; 01552 01553 01554 if(guesslength == int(a)+1 // +1 since the size line was counted in guesslength but should not 01555 && real(int(a))==a && real(int(b))==b // Sizes must be integers and 01556 && a>0 && b>0 // positive 01557 && int(b)==nfields2 ) // The first row of values has the expected width 01558 { 01559 // We assume we have an old style .amat 01560 length = int(a); 01561 width = int(b); 01562 } 01563 } 01564 01565 } // end of namespace PLearn

Generated on Tue Aug 17 15:58:09 2004 for PLearn by doxygen 1.3.7