Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

DiskVMatrix.cc

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // PLearn (A C++ Machine Learning Library) 00004 // Copyright (C) 1998 Pascal Vincent 00005 // Copyright (C) 1999-2001 Pascal Vincent, Yoshua Bengio, Rejean Ducharme and University of Montreal 00006 // Copyright (C) 2002 Pascal Vincent, Julien Keable, Xavier Saint-Mleux 00007 // 00008 // Redistribution and use in source and binary forms, with or without 00009 // modification, are permitted provided that the following conditions are met: 00010 // 00011 // 1. Redistributions of source code must retain the above copyright 00012 // notice, this list of conditions and the following disclaimer. 00013 // 00014 // 2. Redistributions in binary form must reproduce the above copyright 00015 // notice, this list of conditions and the following disclaimer in the 00016 // documentation and/or other materials provided with the distribution. 00017 // 00018 // 3. The name of the authors may not be used to endorse or promote 00019 // products derived from this software without specific prior written 00020 // permission. 00021 // 00022 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00023 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00024 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00025 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00026 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00027 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00028 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00029 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00030 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00031 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00032 // 00033 // This file is part of the PLearn library. For more information on the PLearn 00034 // library, go to the PLearn Web site at www.plearn.org 00035 00036 00037 /* ******************************************************* 00038 * $Id: DiskVMatrix.cc,v 1.18 2004/07/21 16:30:55 chrish42 Exp $ 00039 ******************************************************* */ 00040 00041 #include <errno.h> 00042 #include "DiskVMatrix.h" 00043 #include <plearn/io/pl_io.h> 00044 #include <errno.h> 00045 00046 namespace PLearn { 00047 using namespace std; 00048 00049 #ifdef WIN32 00050 #include <io.h> 00051 #define unlink _unlink 00052 #endif 00053 00055 /* Format description 00056 00057 The directory in variable dirname, which should end in .dmat, 00058 contains a file named 'indexfile' and data files of up to roughly 00059 500Meg each named 0.data, 1.data, ... 00060 00061 The indexfile has a 4 byte header, of which currently only the first byte is checked. 00062 'L' means new little-endian format 00063 'B' means new big-endian format 00064 An ascii code of 16 means old little-endian format 00065 00066 In the new formats, the last 3 bytes of the 4 byte header are ' ' 00067 00068 All other raw-binary data (whether in the indexfile or the data files) 00069 is written in the specified endianness (and swapping will be performed if 00070 reading it on a machine with a different endianness). 00071 00072 Following the 4-byte header are two 4-byte ints: length and width of the matrix 00073 00074 Following are 5 bytes for each row of the matrix. 00075 The first of those 5 bytes is an unsigned byte indicating the number (0-255) of 00076 the data file in which to find the row. 00077 The remaining 4 bytes are an unsigned int indicating the start position of the row in 00078 that data file (as passed to fseek). 00079 00080 The row is encoded in the thus indicated data file at the indicated position, 00081 using the new vector compression format (see new_read_compressed) 00082 [ the old compression format pf binread_compressed is supported 00083 for backward compatibility only. ] 00084 00085 */ 00086 00087 DiskVMatrix::DiskVMatrix() 00088 : indexf(0),freshnewfile(false), 00089 old_format(false),swap_endians(false), 00090 tolerance(1e-6) 00091 { 00092 writable = false; 00093 } 00094 00095 DiskVMatrix::DiskVMatrix(const string& the_dirname, bool readwrite) 00096 : indexf(0),freshnewfile(false), 00097 old_format(false),swap_endians(false), 00098 dirname(remove_trailing_slash(the_dirname)), 00099 tolerance(1e-6) 00100 { 00101 writable = readwrite; 00102 build_(); 00103 } 00104 00105 DiskVMatrix::DiskVMatrix(const string& the_dirname, int the_width, bool write_double_as_float) 00106 : RowBufferedVMatrix(0,the_width), 00107 indexf(0), 00108 freshnewfile(true), 00109 old_format(false),swap_endians(false), 00110 dirname(remove_trailing_slash(the_dirname)), 00111 tolerance(1e-6) 00112 { 00113 writable = true; 00114 build_(); 00115 } 00116 00117 void DiskVMatrix::build() 00118 { 00119 inherited::build(); 00120 build_(); 00121 } 00122 00123 void DiskVMatrix::build_() 00124 { 00125 if(!freshnewfile) 00126 { 00127 if(!isdir(dirname)) 00128 PLERROR("In DiskVMatrix constructor, directory %s could not be found",dirname.c_str()); 00129 setMetaDataDir(dirname + ".metadata"); 00130 setMtime(mtime(append_slash(dirname)+"indexfile")); 00131 string omode; 00132 if(writable) 00133 omode = "r+b"; 00134 else // read-only 00135 omode = "rb"; 00136 00137 string indexfname = dirname+slash+"indexfile"; 00138 indexf = fopen(indexfname.c_str(), omode.c_str()); 00139 if(!indexf) 00140 PLERROR("In DiskVMatrix constructor, could not open file %s in specified mode", indexfname.c_str()); 00141 00142 unsigned char header[4]; 00143 fread(header,1,4,indexf); 00144 if(header[0]=='L' || header[0]=='B') 00145 { // New format 00146 old_format = false; 00147 swap_endians = (header[0]!=byte_order()); 00148 } 00149 else if(header[0]==16) 00150 { // Old format 00151 old_format = true; 00152 if(byte_order()!='L') 00153 PLERROR("Old format DiskVMatrix can only be read from a little-endian machine.\n" 00154 "Convert it to a new format on a little-endian machine prior to attempt\n" 00155 "using it from a big endian machine.\n"); 00156 swap_endians = false; 00157 } 00158 else 00159 { 00160 PLERROR("Wrong header byte in index file %s: ascii code %d\n" 00161 "(should be 'L' or 'B' or '...')\n", indexfname.c_str(), header[0]); 00162 } 00163 00164 fread(&length_,sizeof(int),1,indexf); 00165 fread(&width_,sizeof(int),1,indexf); 00166 if(swap_endians) 00167 { 00168 endianswap(&length_); 00169 endianswap(&width_); 00170 } 00171 int k=0; 00172 string fname = dirname+slash+tostring(k)+".data"; 00173 while(isfile(fname)) 00174 { 00175 FILE* f = fopen(fname.c_str(), omode.c_str()); 00176 if(!f) 00177 PLERROR("In DiskVMatrix constructor, could not open file %s in specified mode", fname.c_str()); 00178 dataf.append(f); 00179 fname = dirname+slash+tostring(++k)+".data"; 00180 } 00181 // Stuff related to RowBufferedVMatrix, for consistency 00182 current_row_index = -1; 00183 current_row.resize(width_); 00184 other_row_index = -1; 00185 other_row.resize(width_); 00186 00187 //resize the string mappings 00188 map_sr = TVec<map<string,real> >(width_); 00189 map_rs = TVec<map<real,string> >(width_); 00190 00191 getFieldInfos(); 00192 if (writable) 00193 fseek(indexf, 0, SEEK_END); 00194 } 00195 else 00196 { 00197 if(isdir(dirname)) 00198 PLERROR("In DiskVMatrix constructor (with specified width), directory %s already exists",dirname.c_str()); 00199 setMetaDataDir(dirname + ".metadata"); 00200 setMtime(mtime(append_slash(dirname)+"indexfile")); 00201 00202 if(isfile(dirname)) // patch for running mkstemp (TmpFilenames) 00203 unlink(dirname.c_str()); 00204 if(!force_mkdir(dirname)) // force directory creation 00205 PLERROR("In DiskVMatrix constructor (with specified width), could not create directory %s Error was: %s",dirname.c_str(), strerror(errno)); 00206 00207 string indexfname = dirname + slash + "indexfile"; 00208 indexf = fopen(indexfname.c_str(),"w+b"); 00209 00210 char header[4]; 00211 header[0] = byte_order(); 00212 header[1] = ' '; 00213 header[2] = ' '; 00214 header[3] = ' '; 00215 fwrite(header,1,4,indexf); 00216 fwrite((char*)&length_,sizeof(int),1,indexf); 00217 fwrite((char*)&width_,sizeof(int),1,indexf); 00218 00219 string fname = dirname + slash + "0.data"; 00220 FILE* f = fopen(fname.c_str(), "w+b"); 00221 dataf.append(f); 00222 } 00223 freshnewfile=false; 00224 } 00225 00226 void DiskVMatrix::declareOptions(OptionList &ol) 00227 { 00228 declareOption(ol, "dirname", &DiskVMatrix::dirname, OptionBase::buildoption, "Directory name of the.dmat"); 00229 declareOption(ol, "tolerance", &DiskVMatrix::tolerance, OptionBase::buildoption, "The absolute error tolerance for storing doubles as floats"); 00230 inherited::declareOptions(ol); 00231 } 00232 00233 void DiskVMatrix::getNewRow(int i, const Vec& v) const 00234 { 00235 #ifdef BOUNDCHECK 00236 if(i<0 || i>length()) 00237 PLERROR("In DiskVMatrix::getNewRow, bad row number %d",i); 00238 if(v.length() != width()) 00239 PLERROR("In DiskVMatrix::getNewRow, length of v (%d) does not match matrix width (%d)",v.length(),width()); 00240 #endif 00241 00242 unsigned char filenum; 00243 unsigned int position; 00244 fseek(indexf,3*sizeof(int) + i*(sizeof(unsigned char)+sizeof(unsigned int)), SEEK_SET); 00245 fread(&filenum,sizeof(unsigned char),1,indexf); 00246 fread(&position,sizeof(unsigned int),1,indexf); 00247 if(swap_endians) 00248 endianswap(&position); 00249 FILE* f = dataf[int(filenum)]; 00250 fseek(f,position,SEEK_SET); 00251 if(old_format) 00252 binread_compressed(f,v.data(),v.length()); 00253 else 00254 new_read_compressed(f, v.data(), v.length(), swap_endians); 00255 } 00256 00257 void DiskVMatrix::putRow(int i, Vec v) 00258 { 00259 PLERROR("putRow cannot in general be correctly and efficiently implemented for a DiskVMatrix.\n" 00260 "Use appendRow if you wish to write more rows."); 00261 } 00262 00263 void DiskVMatrix::appendRow(Vec v) 00264 { 00265 if(!writable) 00266 PLERROR("In DiskVMatrix::appendRow cannot append row in read only mode, set readwrite parameter to true when calling the constructor"); 00267 if(v.length() != width()) 00268 PLERROR("In DiskVMatrix::appendRow, length of v (%d) does not match matrix width (%d)",v.length(),width()); 00269 00270 int filenum = dataf.size()-1; 00271 FILE* f = dataf[filenum]; 00272 fseek(f,0,SEEK_END); 00273 unsigned int position = (unsigned int)ftell(f); 00274 if(position>500000000L) 00275 { 00276 fflush(f); 00277 filenum++; 00278 string filename = dirname + slash + tostring(filenum) + ".data"; 00279 f = fopen(filename.c_str(), "w+b"); 00280 dataf.append(f); 00281 position = 0; 00282 } 00283 if(old_format) 00284 binwrite_compressed(f,v.data(),v.length()); 00285 else 00286 new_write_compressed(f, v.data(),v.length(), tolerance, swap_endians); 00287 00288 fseek(indexf,0,SEEK_END); 00289 fputc(filenum,indexf); 00290 fwrite((char*)&position,sizeof(unsigned int),1,indexf); 00291 length_++; 00292 fseek(indexf,sizeof(int),SEEK_SET); 00293 int le = length_; 00294 if(swap_endians) 00295 endianswap(&le); 00296 fwrite(&le,sizeof(int),1,indexf); 00297 } 00298 00299 void DiskVMatrix::flush() 00300 { 00301 int filenum = dataf.size()-1; 00302 FILE* f = dataf[filenum]; 00303 fflush(f); 00304 fflush(indexf); 00305 } 00306 00307 DiskVMatrix::~DiskVMatrix() 00308 { 00309 for(int i=0; i<dataf.size(); i++) 00310 { 00311 if(dataf[i]) 00312 fclose(dataf[i]); 00313 } 00314 00315 if(indexf) 00316 fclose(indexf); 00317 00318 saveFieldInfos(); 00319 } 00320 00321 PLEARN_IMPLEMENT_OBJECT(DiskVMatrix, "ONE LINE DESCR", "NO HELP"); 00322 00323 #ifdef WIN32 00324 #undef unlink 00325 #endif 00326 00327 } // end of namespcae PLearn

Generated on Tue Aug 17 15:51:24 2004 for PLearn by doxygen 1.3.7