00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
#include "SDBWithStats.h"
00046
00047
namespace PLearn {
00048
using namespace std;
00049
00050 int FieldStat::max_nsymbols = 400; // Cap on nsymbols(): updateString()/updateNumber() stop touching symbolcount once it is reached, and finalize() then discards the symbol tables.
00051
00052 void FieldStat::updateString(
const string& sym)
00053 {
00054 ++
nonmissing_;
00055
if(
nsymbols()<
max_nsymbols)
00056
symbolcount[sym]++;
00057 }
00058
00059 void FieldStat::updateNumber(
double d)
00060 {
00061 ++
nonmissing_;
00062
if(
nsymbols()<
max_nsymbols)
00063
symbolcount[
tostring(d)]++;
00064
sum_ += d;
00065
sumsquare_ += d*d;
00066
if(d<
min_)
00067
min_ = d;
00068
if(d>
max_)
00069
max_ = d;
00070 }
00071
00072 void FieldStat::clear()
00073 {
00074
nonmissing_ = 0;
00075
missing_ = 0;
00076
sum_ = 0;
00077
sumsquare_ = 0;
00078
min_ = FLT_MAX;
00079
max_ = -FLT_MAX;
00080
mean_ = 0;
00081
stddev_ = 0;
00082
symbolcount.clear();
00083
symbolid.clear();
00084 }
00085
00086 void FieldStat::finalize()
00087 {
00088
mean_ =
sum_/
nonmissing_;
00089
double meansquare_ =
sumsquare_/
nonmissing_;
00090
stddev_ =
sqrt( meansquare_ -
square(
mean_) );
00091
if(
nsymbols()>=
max_nsymbols)
00092 {
00093
symbolcount.clear();
00094
symbolid.clear();
00095 }
00096 }
00097
00098 SDBWithStats::SDBWithStats(
string basename,
string path, AccessType access,
bool verbose)
00099 :
SDB(basename,path,access,verbose)
00100 {
00101
fieldstat.resize(
getSchema().
size());
00102
if(
hasStats())
00103
loadStats();
00104 }
00105
00106 void SDBWithStats::forgetStats()
00107 {
00108
for(
int j=0; j<
width(); j++)
00109
getStat(j).
clear();
00110 }
00111
00112 void SDBWithStats::computeStats(
unsigned int nrows)
00113 {
00114
forgetStats();
00115
const Schema& sc =
getSchema();
00116
Row row(&sc);
00117
00118
for(
SDB::RowNumber i=0; i<nrows;i++)
00119 {
00120 getInRow(i, row);
00121
Row::const_iterator it = row.
begin();
00122
if (nrows>100000 && i%100000==0)
00123 cout <<
"SDBWithStats::computeStats processing row " << i <<
" of " << nrows <<
endl;
00124
00125
for (
int j=0; j<
nfields(); ++j, ++it)
00126 {
00127
if(it.
isMissing())
00128
fieldstat[j].updateMissing();
00129
else
00130 {
00131
switch(it.
getFieldType())
00132 {
00133
case StringType:
00134
case CharacterType:
00135
fieldstat[j].updateString(
tostring(it));
00136
break;
00137
case SignedCharType:
00138
case ShortType:
00139
case IntType:
00140
case FloatType:
00141
case DoubleType:
00142
case DateType:
00143
fieldstat[j].updateNumber(
todouble(it));
00144
break;
00145
default:
00146
PLERROR(
"Unknown field type");
00147 }
00148 }
00149 }
00150 }
00151 cout <<
"boucle terminee" <<
endl;
00152
for (
int j=0; j<
nfields(); ++j)
00153
fieldstat[j].finalize();
00154 cout <<
"fini computestats" <<
endl;
00155 }
00156
00157 bool SDBWithStats::hasStats()
00158 {
00159
string numstatsfile =
getPath()+
getName()+
".stats";
00160
string symstatsfile =
getPath()+
getName()+
".symbols";
00161
return file_exists(numstatsfile.c_str()) &&
file_exists(symstatsfile.c_str());
00162 }
00163
00164 void SDBWithStats::saveStats()
00165 {
00166
string numstatsfile =
getPath()+
getName()+
".stats";
00167
string symstatsfile =
getPath()+
getName()+
".symbols";
00168
00169 ofstream numstats(numstatsfile.c_str());
00170
if(!numstats)
00171
PLERROR(
"could not open file %s for writing",numstatsfile.c_str());
00172
00173 ofstream symstats(symstatsfile.c_str());
00174
if(!symstats)
00175
PLERROR(
"could not open file %s for writing",symstatsfile.c_str());
00176
00177 numstats.precision(8);
00178 symstats.precision(8);
00179
00180
for(
unsigned int j=0; j<
fieldstat.size(); j++)
00181 {
00182
FieldStat& s =
fieldstat[j];
00183 numstats <<
fieldname(j) <<
' ' << s.
nonmissing() <<
' ' << s.
missing() <<
' '
00184 << s.
mean() <<
' ' << s.
stddev() <<
' ' << s.
min() <<
' ' << s.
max() <<
endl;
00185
00186 symstats << fieldname(j) <<
' ';
00187 symstats << s.
nsymbols() <<
" ";
00188 map<string,int>::iterator it;
00189
for(it = s.
symbolcount.begin(); it!= s.
symbolcount.end(); ++it)
00190 symstats << it->first <<
' ' << it->second <<
" ";
00191 symstats <<
endl;
00192 }
00193 }
00194
00195 void SDBWithStats::loadStats()
00196 {
00197
forgetStats();
00198
string numstatsfile =
getPath()+
getName()+
".stats";
00199
string symstatsfile =
getPath()+
getName()+
".symbols";
00200 ifstream numstats(numstatsfile.c_str());
00201
if(!numstats)
00202
PLERROR(
"could not open file %s for reading",numstatsfile.c_str());
00203
00204 ifstream symstats(symstatsfile.c_str());
00205
if(!symstats)
00206
PLERROR(
"could not open file %s for reading",symstatsfile.c_str());
00207
00208
for(
unsigned int j=0; j<
fieldstat.size(); j++)
00209 {
00210
string str_nonmissing_,str_missing_,str_mean_,str_stddev_,str_min_,str_max_;
00211
FieldStat& s =
fieldstat[j];
00212
string name;
00213 numstats >> name;
00214
00215
00216
if(name!=
fieldname(j))
00217
PLERROR(
"Row number %d of file %s does not correpond to field number %d: %s",j,numstatsfile.c_str(),j,
fieldname(j).c_str());
00218
00219
00220 numstats >> str_nonmissing_ >> str_missing_ >> str_mean_ >> str_stddev_ >> str_min_ >> str_max_;
00221
00222
if(str_mean_==
"nan")str_mean_=
"";
00223
if(str_stddev_==
"nan")str_stddev_=
"";
00224
if(str_min_==
"nan")str_min_=
"";
00225
if(str_max_==
"nan")str_max_=
"";
00226
00227 s.
nonmissing_ =
toint(str_nonmissing_);
00228 s.
missing_ =
toint(str_missing_);
00229 s.
mean_ =
tofloat(str_mean_);
00230 s.
stddev_ =
tofloat(str_stddev_);
00231 s.
min_ =
tofloat(str_min_);
00232 s.
max_ =
tofloat(str_max_);
00233
00234
00235
00236 symstats >> name;
00237
if(name!=fieldname(j))
00238
PLERROR(
"Row number %d of file %s does not correpond to field number %d: %s",j,symstatsfile.c_str(),j,fieldname(j).c_str());
00239
00240
int nsymbols;
00241 symstats >> nsymbols;
00242
string sym;
00243
int symcount;
00244
for(
int k=0;
k<nsymbols;
k++)
00245 {
00246 symstats >> sym >> symcount;
00247 s.
symbolcount[sym] = symcount;
00248 s.
symbolid[sym]=
k;
00249 }
00250 }
00251
00252 }
00253
00254 FieldStat& SDBWithStats::getStat(
int i)
00255 {
00256
if(i<0 || i>=
width())
00257
PLERROR(
"Out of bounds");
00258
return fieldstat[i];
00259 }
00260
00261 const FieldStat& SDBWithStats::getStat(
int i)
const
00262
{
00263
if(i<0 || i>=
width())
00264
PLERROR(
"Out of bounds");
00265
return fieldstat[i];
00266 }
00267
00268 FieldStat& SDBWithStats::getStat(
const string& fieldname)
00269 {
00270
int pos = indexOfField(fieldname);
00271
if(pos<0)
00272
PLERROR(
"No field named %s",fieldname.c_str());
00273
return getStat(pos);
00274 }
00275
00276 const FieldStat& SDBWithStats::getStat(
const string& fieldname)
const
00277
{
00278
int pos = indexOfField(fieldname);
00279
if(pos<0)
00280
PLERROR(
"No field named %s",fieldname.c_str());
00281
return getStat(pos);
00282 }
00283
00284
00285 }