00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
#include "IntStream.h"
00039
00040
00041
#ifndef _MSC_VER
00042
#include <unistd.h>
00043
#else
00044
#include <cstdlib>
00045
#endif
00046
00047
00048
00049
00050
00051
00052
namespace PLearn {
00053
using namespace std;
00054
00055 IntStreamVMatrix::IntStreamVMatrix(
IntStream& s,
int window_size,
int dummyinput,
int eos)
00056 :
RowBufferedVMatrix(s.size(),window_size), stream(&s), position(-1),
00057 dummy_input(dummyinput), end_of_sequence_symbol(eos)
00058 {
00059 current_row.
fill(
dummy_input);
00060 }
00061
00062 void IntStreamVMatrix::getRow(
int i,
Vec v)
const
00063
{
00064
if (i==
position+1)
00065 {
00066
int j=1;
00067
if (current_row[
width()-1]==
end_of_sequence_symbol)
00068 {
00069
for (;j<
width()-1;j++) current_row[j-1]=
dummy_input;
00070 current_row[j++ - 1]=
end_of_sequence_symbol;
00071 }
00072
else
00073
for (;j<
width();j++)
00074 current_row[j-1]=current_row[j];
00075
#if USE_JS_HACK
00076
{
00077
real next=
stream->next();
00078
if (next>=MAX_VOC_SIZE) next = MAX_VOC_SIZE-1;
00079 current_row[j-1] = next;
00080 }
00081
#else
00082
current_row[j-1]=
stream->next();
00083
#endif
00084
}
00085
else if (i!=
position)
00086 {
00087
if (i>=
width())
00088 {
00089
stream->seek(i-
width()+1);
00090
for (
int j=0;j<
width();j++)
00091
#if USE_JS_HACK
00092
{
00093
real next=
stream->next();
00094
if (next>=MAX_VOC_SIZE) next = MAX_VOC_SIZE-1;
00095 current_row[j] = next;
00096 }
00097
#else
00098
current_row[j]=
stream->next();
00099
#endif
00100
}
else
00101 {
00102
if (i<0)
PLERROR(
"IntStreamVMat::getRow at row %d < 0!",i);
00103
stream->seek(0);
00104
int j=0;
00105
for (;j<
width()-i-1;j++)
00106 current_row[j]=
dummy_input;
00107
for (;j<
width();j++)
00108
#if USE_JS_HACK
00109
{
00110
real next=
stream->next();
00111
if (next>=MAX_VOC_SIZE) next = MAX_VOC_SIZE-1;
00112 current_row[j] = next;
00113 }
00114
#else
00115
current_row[j]=
stream->next();
00116
#endif
00117
}
00118 }
00119
position=i;
00120 v << current_row;
00121 }
00122
00123 FilesIntStream::FilesIntStream(
int nfiles,
const char* files[])
00124 :
IntStream(-1), n_files(nfiles), file_names(files), current_file(0),
00125 next_pos_in_current_file(0)
00126 {
00127
fp=(FILE**)malloc(
n_files*
sizeof(FILE*));
00128
sizes=(
int*)calloc(
n_files,
sizeof(
int));
00129
total_size=0;
00130
for (
int i=0;i<
n_files;i++) {
00131
fp[i]=fopen(
file_names[i],
"r");
00132
if (!
fp[i])
00133
PLERROR(
"FilesIntStream::FilesIntStream, can't open file %s\n",
file_names[i]);
00134
if (fseek(
fp[i],0,SEEK_END))
00135
PLERROR(
"In FileIntStream constructor: fseek(%s,0,SEEK_END) failed\n",
file_names[i]);
00136
total_size+=(
sizes[i] = (ftell(
fp[i])/
sizeof(
int)));
00137 fseek(
fp[i],0,SEEK_SET);
00138 }
00139
read_current();
00140 }
00141
00142 void FilesIntStream::reopen()
00143 {
00144
00145
for (
int i=0;i<
n_files;i++) {
00146
fp[i]=fopen(
file_names[i],
"r");
00147
if (!
fp[i])
00148
PLERROR(
"FilesIntStream::reopen, can't open file %s\n",
file_names[i]);
00149 fseek(
fp[i],0,SEEK_SET);
00150 }
00151
00152
seek(pos);
00153 }
00154
00155
00156
00157 void FilesIntStream::read_current()
00158 {
00159
if (
n_files<1)
PLERROR(
"FilesIntStream::read_current(): no file opened");
00160
if (pos==
total_size) {
00161
seek(0);
00162
return;
00163 }
00164
if (
next_pos_in_current_file==
sizes[
current_file]) {
00165
next_pos_in_current_file = 0;
00166
current_file++;
00167
if (
current_file==
n_files)
00168 {
seek(0);
return; }
00169 }
00170
if (fread(&
current_value,
sizeof(
int),1,
fp[
current_file])!=1) {
00171
int posit=ftell(
fp[current_file]);
00172
00173
00174
#ifdef WIN32
00175
fprintf(stderr,
"process could not read 1 int from %s at position %d, ftell=%d\nerrno=%d,%s",
00176
file_names[current_file],
next_pos_in_current_file+1,
00177 posit,errno,strerror(errno));
00178
#else
00179
int pid=getpid();
00180 fprintf(stderr,
"process %d could not read 1 int from %s at position %d, ftell=%d\nerrno=%d,%s",
00181 pid,
file_names[current_file],
next_pos_in_current_file+1,
00182 posit,errno,strerror(errno));
00183
#endif
00184
00185 exit(1);
00186 }
00187
#ifdef BIGENDIAN
00188
reverse_int(&
current_value,1);
00189
#endif
00190
next_pos_in_current_file++;
00191 pos++;
00192 }
00193
00194
00195 void FilesIntStream::seek(
long position)
00196 {
00197
if (position<0 || position>=
total_size) {
00198 fprintf(stderr,
"FilesIntStream::seek(%ld), argument must be in [0,%d)\n",
00199 position,
total_size);
00200 exit(1);
00201 }
00202 pos=0;
00203
int i;
00204
for (i=0;i<
n_files-1 && position>=pos+
sizes[i];i++) pos+=sizes[i];
00205
next_pos_in_current_file=position-pos;
00206
for (
int j=0;j<
n_files;j++) {
00207
int p = (i==j)?
next_pos_in_current_file*
sizeof(
int):0;
00208
if (fseek(
fp[j],p,SEEK_SET))
00209
PLERROR(
"In FileIntStream::seek fseek(%s,%d,SEEK_SET) failed\n",
file_names[j],
next_pos_in_current_file);
00210 }
00211
current_file=i;
00212
00213 pos=position-1;
00214
read_current();
00215 }
00216
00217
00218 int FilesIntStream::next()
00219 {
00220
int c=
current_value;
00221
read_current();
00222
return c;
00223 }
00224
00225
00226 int FilesIntStream::current()
00227 {
00228
return current_value;
00229 }
00230
00231
00232 long FilesIntStream::size()
00233 {
00234
return total_size;
00235 }
00236
00237 FilesIntStream::~FilesIntStream()
00238 {
00239
for (
int i=0;i<
n_files;i++)
00240 fclose(
fp[i]);
00241 free(
fp);
00242 free(
sizes);
00243 }
00244
00245
00246
00247 FilesIntStream*
word_sequences2files_int_stream(
const char* word_sequences_file)
00248 {
00249 FILE* word_sequences_fp=fopen(word_sequences_file,
"r");
00250
if (!word_sequences_fp)
00251
PLERROR(
"word_sequences2files_int_stream: can't open file %s",word_sequences_file);
00252
typedef const char* cstring;
00253
const char** word_sequences =
new cstring[1000];
00254
int n_word_sequences=0;
00255
char buffer[1000];
00256
while (!feof(word_sequences_fp)) {
00257
if (!fgets(buffer,1000,word_sequences_fp))
break;
00258
int l=(
int)
strlen(buffer);
00259
if (buffer[l-1]==
'\n') buffer[l-1]=
'\0';
00260 word_sequences[n_word_sequences]=
tostring(buffer).c_str();
00261 n_word_sequences++;
00262 }
00263 fclose(word_sequences_fp);
00264
return new FilesIntStream(n_word_sequences,word_sequences);
00265 }
00266
00267 InMemoryIntStream::InMemoryIntStream(
IntStream& stream)
00268 {
00269
length = stream.
size();
00270
data =
new int[
length];
00271
for (
int i=0;i<
length;i++)
00272
data[i] = stream.
next();
00273 }
00274
00275 }
00276