Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

IntStream.cc

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // PLearn (A C++ Machine Learning Library) 00004 // Copyright (C) 1998 Pascal Vincent 00005 // Copyright (C) 1999-2002 Pascal Vincent, Yoshua Bengio and University of Montreal 00006 // 00007 00008 // Redistribution and use in source and binary forms, with or without 00009 // modification, are permitted provided that the following conditions are met: 00010 // 00011 // 1. Redistributions of source code must retain the above copyright 00012 // notice, this list of conditions and the following disclaimer. 00013 // 00014 // 2. Redistributions in binary form must reproduce the above copyright 00015 // notice, this list of conditions and the following disclaimer in the 00016 // documentation and/or other materials provided with the distribution. 00017 // 00018 // 3. The name of the authors may not be used to endorse or promote 00019 // products derived from this software without specific prior written 00020 // permission. 00021 // 00022 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00023 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00024 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00025 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00026 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00027 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00028 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00029 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00030 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00031 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00032 // 00033 // This file is part of the PLearn library. For more information on the PLearn 00034 // library, go to the PLearn Web site at www.plearn.org 00035 00036 /* IntStream.cc */ 00037 00038 #include "IntStream.h" 00039 //#include <sys/stat.h> 00040 00041 #ifndef _MSC_VER 00042 #include <unistd.h> 00043 #else 00044 #include <cstdlib> 00045 #endif 00046 00047 //#include <cerrno> 00048 00049 // I am commenting this out because the compiler complained of previous declarations fronm stdlib.h throwing different exceptions!!! (Pascal) 00050 // #include <malloc.h> 00051 00052 namespace PLearn { 00053 using namespace std; 00054 00055 IntStreamVMatrix::IntStreamVMatrix(IntStream& s, int window_size, int dummyinput, int eos) 00056 : RowBufferedVMatrix(s.size(),window_size), stream(&s), position(-1), 00057 dummy_input(dummyinput), end_of_sequence_symbol(eos) 00058 { 00059 current_row.fill(dummy_input); 00060 } 00061 00062 void IntStreamVMatrix::getRow(int i, Vec v) const 00063 { 00064 if (i==position+1) // most frequent case 00065 { 00066 int j=1; 00067 if (current_row[width()-1]==end_of_sequence_symbol) 00068 { 00069 for (;j<width()-1;j++) current_row[j-1]=dummy_input; 00070 current_row[j++ - 1]=end_of_sequence_symbol; 00071 } 00072 else 00073 for (;j<width();j++) 00074 current_row[j-1]=current_row[j]; 00075 #if USE_JS_HACK 00076 { 00077 real next=stream->next(); 00078 if (next>=MAX_VOC_SIZE) next = MAX_VOC_SIZE-1; 00079 current_row[j-1] = next; 00080 } 00081 #else 00082 current_row[j-1]=stream->next(); 00083 #endif 00084 } 00085 else if (i!=position) 00086 { 00087 if (i>=width()) 00088 { 00089 stream->seek(i-width()+1); 00090 for (int j=0;j<width();j++) 00091 #if USE_JS_HACK 00092 { 00093 real next=stream->next(); 00094 if (next>=MAX_VOC_SIZE) next = MAX_VOC_SIZE-1; 00095 current_row[j] = next; 00096 } 00097 #else 00098 current_row[j]=stream->next(); 00099 #endif 00100 } else 00101 { 00102 if (i<0) PLERROR("IntStreamVMat::getRow at row %d < 0!",i); 00103 stream->seek(0); 00104 int j=0; 00105 for (;j<width()-i-1;j++) 00106 current_row[j]=dummy_input; 00107 for (;j<width();j++) 00108 #if USE_JS_HACK 00109 { 00110 real next=stream->next(); 00111 if (next>=MAX_VOC_SIZE) next = MAX_VOC_SIZE-1; 00112 current_row[j] = next; 00113 } 00114 #else 00115 current_row[j]=stream->next(); 00116 #endif 00117 } 00118 } 00119 position=i; 00120 v << current_row; 00121 } 00122 00123 FilesIntStream::FilesIntStream(int nfiles, const char* files[]) 00124 : IntStream(-1), n_files(nfiles), file_names(files), current_file(0), 00125 next_pos_in_current_file(0) 00126 { 00127 fp=(FILE**)malloc(n_files*sizeof(FILE*)); 00128 sizes=(int*)calloc(n_files,sizeof(int)); 00129 total_size=0; 00130 for (int i=0;i<n_files;i++) { 00131 fp[i]=fopen(file_names[i],"r"); 00132 if (!fp[i]) 00133 PLERROR("FilesIntStream::FilesIntStream, can't open file %s\n",file_names[i]); 00134 if (fseek(fp[i],0,SEEK_END)) 00135 PLERROR("In FileIntStream constructor: fseek(%s,0,SEEK_END) failed\n",file_names[i]); 00136 total_size+=(sizes[i] = (ftell(fp[i])/sizeof(int))); 00137 fseek(fp[i],0,SEEK_SET); 00138 } 00139 read_current(); 00140 } 00141 00142 void FilesIntStream::reopen() 00143 { 00144 // re-open all the file pointers 00145 for (int i=0;i<n_files;i++) { 00146 fp[i]=fopen(file_names[i],"r"); 00147 if (!fp[i]) 00148 PLERROR("FilesIntStream::reopen, can't open file %s\n",file_names[i]); 00149 fseek(fp[i],0,SEEK_SET); 00150 } 00151 // return to same position as previously 00152 seek(pos); 00153 } 00154 00155 // read from current current_file at next_pos_in_current_file into current_value, 00156 // and increment next_pos_in_current_file 00157 void FilesIntStream::read_current() 00158 { 00159 if (n_files<1) PLERROR("FilesIntStream::read_current(): no file opened"); 00160 if (pos==total_size) { 00161 seek(0); 00162 return; 00163 } 00164 if (next_pos_in_current_file==sizes[current_file]) { 00165 next_pos_in_current_file = 0; 00166 current_file++; 00167 if (current_file==n_files) 00168 { seek(0); return; } 00169 } 00170 if (fread(&current_value,sizeof(int),1,fp[current_file])!=1) { 00171 int posit=ftell(fp[current_file]); 00172 00173 // norman: added check. Can be done better 00174 #ifdef WIN32 00175 fprintf(stderr,"process could not read 1 int from %s at position %d, ftell=%d\nerrno=%d,%s", 00176 file_names[current_file],next_pos_in_current_file+1, 00177 posit,errno,strerror(errno)); 00178 #else 00179 int pid=getpid(); 00180 fprintf(stderr,"process %d could not read 1 int from %s at position %d, ftell=%d\nerrno=%d,%s", 00181 pid,file_names[current_file],next_pos_in_current_file+1, 00182 posit,errno,strerror(errno)); 00183 #endif 00184 00185 exit(1); 00186 } 00187 #ifdef BIGENDIAN 00188 reverse_int(&current_value,1); 00189 #endif 00190 next_pos_in_current_file++; 00191 pos++; 00192 } 00193 00194 // move to given position 00195 void FilesIntStream::seek(long position) 00196 { 00197 if (position<0 || position>=total_size) { 00198 fprintf(stderr,"FilesIntStream::seek(%ld), argument must be in [0,%d)\n", 00199 position,total_size); 00200 exit(1); 00201 } 00202 pos=0; 00203 int i; 00204 for (i=0;i<n_files-1 && position>=pos+sizes[i];i++) pos+=sizes[i]; 00205 next_pos_in_current_file=position-pos; 00206 for (int j=0;j<n_files;j++) { 00207 int p = (i==j)?next_pos_in_current_file*sizeof(int):0; 00208 if (fseek(fp[j],p,SEEK_SET)) 00209 PLERROR("In FileIntStream::seek fseek(%s,%d,SEEK_SET) failed\n",file_names[j],next_pos_in_current_file); 00210 } 00211 current_file=i; 00212 // pos will be incremented in read_current() 00213 pos=position-1; 00214 read_current(); 00215 } 00216 00217 // return next available int from the stream and increment position 00218 int FilesIntStream::next() 00219 { 00220 int c=current_value; 00221 read_current(); 00222 return c; 00223 } 00224 00225 // return next available int from the stream 00226 int FilesIntStream::current() 00227 { 00228 return current_value; 00229 } 00230 00231 // total length of the stream 00232 long FilesIntStream::size() 00233 { 00234 return total_size; 00235 } 00236 00237 FilesIntStream::~FilesIntStream() 00238 { 00239 for (int i=0;i<n_files;i++) 00240 fclose(fp[i]); 00241 free(fp); 00242 free(sizes); 00243 } 00244 00245 // ****************************************************** 00246 // convert <word_sequences> filename into a FilesIntStream stream 00247 FilesIntStream* word_sequences2files_int_stream(const char* word_sequences_file) 00248 { 00249 FILE* word_sequences_fp=fopen(word_sequences_file,"r"); 00250 if (!word_sequences_fp) 00251 PLERROR("word_sequences2files_int_stream: can't open file %s",word_sequences_file); 00252 typedef const char* cstring; 00253 const char** word_sequences = new cstring[1000]; 00254 int n_word_sequences=0; 00255 char buffer[1000]; 00256 while (!feof(word_sequences_fp)) { 00257 if (!fgets(buffer,1000,word_sequences_fp)) break; 00258 int l=(int)strlen(buffer); 00259 if (buffer[l-1]=='\n') buffer[l-1]='\0'; 00260 word_sequences[n_word_sequences]=tostring(buffer).c_str(); 00261 n_word_sequences++; 00262 } 00263 fclose(word_sequences_fp); 00264 return new FilesIntStream(n_word_sequences,word_sequences); 00265 } 00266 00267 InMemoryIntStream::InMemoryIntStream(IntStream& stream) 00268 { 00269 length = stream.size(); 00270 data = new int[length]; 00271 for (int i=0;i<length;i++) 00272 data[i] = stream.next(); 00273 } 00274 00275 } // end of namespace PLearn 00276

Generated on Tue Aug 17 15:55:40 2004 for PLearn by doxygen 1.3.7