00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
#include "StatsCollector.h"
00043
#include "TMat_maths.h"
00044
00045
namespace PLearn {
00046
using namespace std;
00047
00048
PLEARN_IMPLEMENT_OBJECT(
00049 StatsCollector,
00050
"Collects basic statistics",
00051
"A StatsCollector allows to compute basic global statistics for a series of numbers,\n"
00052
"as well as statistics within automatically determined ranges.\n"
00053
"The first maxnvalues encountered values will be used as reference points to define\n"
00054
"the ranges, so to get reasonable results, your sequence should be iid, and NOT sorted!\n"
00055
"\n"
00056
"The following statistics are available:"
00057
" - E Sample mean\n"
00058
" - V Sample variance\n"
00059
" - STDDEV Sample standard deviation\n"
00060
" - STDERROR Standard error of the mean\n"
00061
" - MIN Minimum value\n"
00062
" - MAX Maximum value\n"
00063
" - SUM Sum of observations \n"
00064
" - SUMSQ Sum of squares\n"
00065
" - FIRST First observation\n"
00066
" - LAST Last observation\n"
00067
" - N Total number of observations\n"
00068
" - NMISSING Number of missing observations\n"
00069
" - NNONMISSING Number of non-missing observations\n"
00070
" - SHARPERATIO Mean divided by standard deviation\n");
00071
00072
00073 StatsCollector::StatsCollector(
int the_maxnvalues)
00074 : maxnvalues(the_maxnvalues),
00075 nmissing_(0.), nnonmissing_(0.),
00076 sum_(0.), sumsquare_(0.),
00077 min_(
MISSING_VALUE), max_(
MISSING_VALUE),
00078 first_(
MISSING_VALUE), last_(
MISSING_VALUE)
00079 {
00080
build_();
00081 }
00082
00083 int sortIdComparator(
const void* i1,
const void* i2)
00084 {
00085
double d = ((
PairRealSCCType*)i1)->first - ((
PairRealSCCType*)i2)->first;
00086
return (d<0)?-1:(d==0?0:1);
00087 }
00088
00091 void StatsCollector::sortIds()
00092 {
00093
PairRealSCCType* allreals=
new PairRealSCCType[
counts.size()];
00094
int i=0;
00095
for(map<real,StatsCollectorCounts>::iterator it =
counts.begin();it!=
counts.end();++it,i++)
00096 allreals[i]=make_pair(it->first,&(it->second));
00097 qsort(allreals,
counts.size(),
sizeof(
PairRealSCCType),
sortIdComparator);
00098
for(
int i=0;i<(
int)
counts.size();i++)
00099 allreals[i].second->id=i;
00100
delete allreals;
00101 }
00102
00103 void StatsCollector::declareOptions(
OptionList& ol)
00104 {
00105
00106
00107
declareOption(ol,
"maxnvalues", &StatsCollector::maxnvalues, OptionBase::buildoption,
00108
"maximum number of different values defining ranges to keep track of in counts\n"
00109
"(if 0, we will only keep track of global statistics)");
00110
00111
00112
declareOption(ol,
"nmissing_", &StatsCollector::nmissing_, OptionBase::learntoption,
00113
"number of missing values");
00114
declareOption(ol,
"nnonmissing_", &StatsCollector::nnonmissing_, OptionBase::learntoption,
00115
"number of non missing value ");
00116
declareOption(ol,
"sum_", &StatsCollector::sum_, OptionBase::learntoption,
00117
"sum of all values");
00118
declareOption(ol,
"sumsquare_", &StatsCollector::sumsquare_, OptionBase::learntoption,
00119
"sum of square of all values ");
00120
declareOption(ol,
"min_", &StatsCollector::min_, OptionBase::learntoption,
00121
"the min");
00122
declareOption(ol,
"max_", &StatsCollector::max_, OptionBase::learntoption,
00123
"the max");
00124
declareOption(ol,
"first_", &StatsCollector::first_, OptionBase::learntoption,
00125
"first encountered observation");
00126
declareOption(ol,
"last_", &StatsCollector::last_, OptionBase::learntoption,
00127
"last encountered observation");
00128
declareOption(ol,
"counts", &StatsCollector::counts, OptionBase::learntoption,
00129
"will contain up to maxnvalues values and associated Counts\n"
00130
"as well as a last element which maps FLT_MAX, so that we don't miss anything\n"
00131
"(remains empty if maxnvalues=0)");
00132
00133
00134 inherited::declareOptions(ol);
00135 }
00136
00137 void StatsCollector::build_()
00138 {
00139
00140
00141
if(
maxnvalues>0 &&
counts.size()==0)
00142
counts[FLT_MAX] =
StatsCollectorCounts();
00143 }
00144
00145 void StatsCollector::build()
00146 {
00147 inherited::build();
00148
build_();
00149 }
00150
00151 void StatsCollector::forget()
00152 {
00153
nmissing_ = 0.;
00154
nnonmissing_ = 0.;
00155
sum_ = 0.;
00156
sumsquare_ = 0.;
00157
min_ =
MISSING_VALUE;
00158
max_ =
MISSING_VALUE;
00159
first_ =
last_ =
MISSING_VALUE;
00160
counts.clear();
00161
build_();
00162 }
00163
00164 void StatsCollector::update(
real val,
real weight)
00165 {
00166
if(
is_missing(
val))
00167
nmissing_ += weight;
00168
else
00169 {
00170
00171
00172
last_ =
val;
00173
if(
nnonmissing_==0)
00174
min_ =
max_ =
first_ =
last_ =
val;
00175
else if(
val<
min_)
00176
min_ =
val;
00177
else if(
val>max_)
00178 max_ =
val;
00179
nnonmissing_ += weight;
00180
double sqval = (
val-first_)*(
val-first_);
00181
sum_ += (
val-first_) * weight;
00182
sumsquare_ += sqval * weight;
00183
00184
if(
maxnvalues>0)
00185 {
00186 map<real,StatsCollectorCounts>::iterator it;
00187
if(
int(
counts.size())<=
maxnvalues)
00188 {
00189 it =
counts.find(
val);
00190
if(it==
counts.end())
00191
counts[
val].id=
counts.size()-1;
00192
counts[
val].n += weight;
00193 }
00194
else
00195 {
00196 it =
counts.lower_bound(
val);
00197
if(it->first==
val)
00198 it->second.n += weight;
00199
else
00200 {
00201 it->second.nbelow += weight;
00202 it->second.sum +=
val * weight;
00203 it->second.sumsquare +=
val*
val * weight;
00204 }
00205 }
00206 }
00207 }
00208 }
00209
00210 RealMapping StatsCollector::getBinMapping(
double discrete_mincount,
00211
double continuous_mincount,
00212
real tolerance,
00213
TVec<double> * fcount)
const
00214
{
00215
real mapto=0.;
00216
RealMapping mapping;
00217 mapping.
setMappingForOther(-1);
00218 map<real,StatsCollectorCounts>::const_iterator it =
counts.begin();
00219
int nleft = (
int)
counts.size()-1;
00220
00221
if(fcount)
00222 {
00223 (*fcount) =
TVec<double>();
00224
00225 fcount->
resize(0,
int(2.*
nnonmissing_ / discrete_mincount));
00226 fcount->
append(
nmissing_);
00227 fcount->
append(0);
00228 }
00229
00230
double count = 0, count2 = 0;
00231
real low =
min_;
00232
real high = min_;
00233
bool low_has_been_appended =
false;
00234
00235
00236
while(nleft--)
00237 {
00238 high = it->first;
00239
00240
count += it->second.nbelow;
00241 count2 += it->second.nbelow;
00242
00243
if(
count>=continuous_mincount)
00244 {
00245
00246 mapping.
addMapping(
00247
RealRange(low_has_been_appended?
']':
'[',low, high,
'['),
00248 mapto++);
00249
if(fcount)
00250 fcount->
append(
count);
00251 low = high;
00252 low_has_been_appended =
false;
00253
count = 0;
00254
00255 }
00256
00257
if(it->second.n >= discrete_mincount)
00258 {
00259
if(
count>0)
00260 {
00261 mapping.
addMapping(
RealRange(low_has_been_appended?
']':
'[',low, high,
'['), mapto++);
00262
if(fcount)
00263 fcount->
append(
count);
00264
count = 0;
00265 }
00266
00267 mapping.
addMapping(
RealRange(
'[',high,high,
']'), mapto++);
00268
if(fcount)
00269 fcount->
append(it->second.n +
count);
00270 count2+=it->second.n;
00271
count=0;
00272 low = high;
00273 low_has_been_appended =
true;
00274 }
00275
else
00276 {
00277 count2+=it->second.n;
00278
count += it->second.n;
00279 }
00280 ++it;
00281 }
00282
00283
if(it->first<=
max_)
00284
PLERROR(
"Bug in StatsCollector::getBinMapping expected last element of mapping to be FLT_MAX...");
00285
00286
if (mapping.
size() == 0)
00287 {
00288
PLWARNING(
"StatsCollector::getBinMapping: no mapping were created; probably a bug");
00289 mapping.
addMapping(
RealRange(
'[',min_,
max_,
']'), 0);
00290
return mapping;
00291 }
00292
00293
00294 pair<RealRange, real> m = mapping.
lastMapping();
00295
00296
00297
double cnt =
nnonmissing_ - count2 +
count;
00298
00299
00300
00301
if(m.first.high<
max_)
00302 {
00303
if( ((
real)cnt/(
real)continuous_mincount)>(1.-tolerance) ||
00304 (m.first.low == m.first.high))
00305 {
00306
00307 mapping.
addMapping(
RealRange(m.first.rightbracket==
'[' ?
'[' :
']',m.first.high,
max_,
']'),
00308 mapto++);
00309
if(fcount)
00310 fcount->
append(cnt);
00311 }
00312
else
00313 {
00314
00315 mapping.
removeMapping(m.first);
00316 mapping.
addMapping(
RealRange(m.first.leftbracket, m.first.low,
max_,
']'),
00317 m.second);
00318
if(fcount)
00319 {
00320
double v = fcount->
back();
00321 fcount->
pop_back();
00322 fcount->
append(v+cnt);
00323 }
00324 }
00325 }
00326
else if(m.first.high==
max_)
00327 {
00328 mapping.
removeMapping(m.first);
00329 mapping.
addMapping(
RealRange(m.first.leftbracket, m.first.low,
max_,
']'),
00330 m.second);
00331 }
00332
return mapping;
00333 }
00334
00335
00336 RealMapping StatsCollector::getAllValuesMapping(
TVec<double> * fcount)
const
00337
{
00338
return getAllValuesMapping(0,fcount);
00339 }
00340
00341 RealMapping StatsCollector::getAllValuesMapping(
TVec<bool>* to_be_included,
00342
TVec<double>* fcount,
bool ignore_other,
00343
real tolerance)
const {
00344
RealMapping mapping;
00345
if (ignore_other) {
00346 mapping.
keep_other_as_is =
false;
00347 mapping.
other_mapsto = -1;
00348 }
00349
int i = 0;
00350
int k = 0;
00351
if(fcount)
00352 {
00353 (*fcount) =
TVec<double>();
00354 fcount->
resize(0,
counts.size()+2);
00355 fcount->
append(
nmissing_);
00356 fcount->
append(0);
00357 }
00358
00359
double count=0;
00360
00361
real epsilon = 0;
00362
if (tolerance > 0) {
00363
00364
StatsCollector values_diff;
00365
for (map<real,StatsCollectorCounts>::const_iterator it =
counts.begin();
00366 size_t(i) <
counts.size() - 2; i++) {
00367
real val1 = it->first;
00368 it++;
00369
real val2 = it->first;
00370 values_diff.
update(val2 - val1);
00371 }
00372
00373
real mean = values_diff.
mean();
00374 epsilon = tolerance *
mean;
00375
if (epsilon < 0) {
00376
PLERROR(
"In StatsCollector::getAllValuesMapping - epsilon < 0, there must be something wrong");
00377 }
00378 }
00379
00380 i = 0;
00381
00382
for(map<real,StatsCollectorCounts>::const_iterator it =
counts.begin() ;
00383 size_t(i) <
counts.size() - 1; ++it)
00384 {
00385
real low_val = it->first - epsilon;
00386
real up_val = it->first + epsilon;
00387 map<real,StatsCollectorCounts>::const_iterator itup = it;
00388 itup++;
00389
int j = i + 1;
00390
bool to_include =
true;
00391
if (to_be_included) {
00392 to_include = (*to_be_included)[i];
00393 }
00394
real count_in_range = it->second.n;
00395
if (tolerance > 0) {
00396
for (; itup !=
counts.end(); itup++) {
00397
if (itup->first - epsilon <= up_val) {
00398
00399
if (fcount) {
00400
PLWARNING(
"In StatsCollector::getAllValuesMapping - You are using fcount and some ranges are merged. "
00401
"This case has not been tested yet. Please remove this warning if it works fine.");
00402 }
00403 up_val = itup->first + epsilon;
00404 count_in_range += itup->second.n;
00405
if (to_be_included) {
00406
00407
00408 to_include = to_include || (*to_be_included)[j];
00409 }
00410 j++;
00411 }
else {
00412
00413
break;
00414 }
00415 }
00416 }
00417
00418
00419 itup--;
00420 it = itup;
00421 i = j - 1;
00422
00423
if (to_include) {
00424 mapping.
addMapping(
RealRange(
'[',low_val,up_val,
']'),
k);
00425
k++;
00426
if(fcount)
00427 {
00428
count += count_in_range;
00429 fcount->
append(count_in_range);
00430 }
00431 }
00432 i++;
00433 }
00434
00435
if(fcount)
00436 (*fcount)[1] =
nnonmissing_ -
count;
00437
return mapping;
00438 }
00439
00440 Mat StatsCollector::cdf(
bool normalized)
const
00441
{
00442
int l = 2*(
int)
counts.size();
00443
00444
Mat xy(l+1,2);
00445
int i=0;
00446
double currentcount = 0;
00447 xy(i,0) =
min_;
00448 xy(i++,1) = 0;
00449 map<real,StatsCollectorCounts>::const_iterator it =
counts.begin();
00450 map<real,StatsCollectorCounts>::const_iterator itend =
counts.end();
00451
for(; it!=itend; ++it)
00452 {
00453
real val = it->first;
00454
if(
val>
max_)
00455
val =
max_;
00456
00457 currentcount += it->second.nbelow;
00458 xy(i,0) =
val;
00459 xy(i++,1) = currentcount;
00460
00461 currentcount += it->second.n;
00462 xy(i,0) =
val;
00463 xy(i++,1) = currentcount;
00464 }
00465
if(normalized)
00466 xy.
column(1) /=
real(
nnonmissing_);
00467
00468
return xy;
00469 }
00470
00471 void StatsCollector::print(ostream& out)
const
00472
{
00473 out <<
"# samples: " <<
n() <<
"\n";
00474 out <<
"# missing: " <<
nmissing() <<
"\n";
00475 out <<
"mean: " <<
mean() <<
"\n";
00476 out <<
"stddev: " <<
stddev() <<
"\n";
00477 out <<
"stderr: " <<
stderror() <<
"\n";
00478 out <<
"min: " <<
min() <<
"\n";
00479 out <<
"max: " <<
max() <<
"\n\n";
00480 out <<
"first: " <<
first_obs() <<
"\n";
00481 out <<
"last: " <<
last_obs() <<
"\n\n";
00482 out <<
"counts size: " <<
counts.size() <<
"\n";
00483 map<real,StatsCollectorCounts>::const_iterator it =
counts.begin();
00484 map<real,StatsCollectorCounts>::const_iterator itend =
counts.end();
00485
for(; it!=itend; ++it)
00486 {
00487 out <<
"value: " << it->first
00488 <<
" #equal:" << it->second.n
00489 <<
" #less:" << it->second.nbelow
00490 <<
" avg_of_less:" << it->second.sum/it->second.nbelow <<
endl;
00491 }
00492 }
00493
00494 void StatsCollector::oldwrite(ostream& out)
const
00495
{
00496
writeHeader(out,
"StatsCollector",0);
00497
writeField(out,
"nmissing_",
nmissing_);
00498
writeField(out,
"nnonmissing_",
nnonmissing_);
00499
writeField(out,
"sum_",
sum_);
00500
writeField(out,
"sumsquare_",
sumsquare_);
00501
writeField(out,
"min_",
min_);
00502
writeField(out,
"max_",
max_);
00503
writeField(out,
"maxnvalues",
maxnvalues);
00504
00505
writeFieldName(out,
"counts");
00506
PLearn::write(out, (
int)
counts.size());
00507
writeNewline(out);
00508 map<real,StatsCollectorCounts>::const_iterator it =
counts.begin();
00509 map<real,StatsCollectorCounts>::const_iterator itend =
counts.end();
00510
for(; it!=itend; ++it)
00511 {
00512
PLearn::write(out, it->first);
00513
PLearn::write(out, it->second.n);
00514
PLearn::write(out, it->second.nbelow);
00515
PLearn::write(out, it->second.sum);
00516
PLearn::write(out, it->second.sumsquare);
00517
writeNewline(out);
00518 }
00519
writeFooter(out,
"StatsCollector");
00520 }
00521
00522 void StatsCollector::oldread(istream& in)
00523 {
00524
int version =
readHeader(in,
"StatsCollector");
00525
if(version!=0)
00526
PLERROR(
"In StatsCollector::oldead don't know how to read this version");
00527
readField(in,
"nmissing_",
nmissing_);
00528
readField(in,
"nnonmissing_",
nnonmissing_);
00529
readField(in,
"sum_",
sum_);
00530
readField(in,
"sumsquare_",
sumsquare_);
00531
readField(in,
"min_",
min_);
00532
readField(in,
"max_",
max_);
00533
readField(in,
"maxnvalues",
maxnvalues);
00534
00535
readFieldName(in,
"counts",
true);
00536
counts.clear();
00537
int ncounts;
00538
PLearn::read(in, ncounts);
00539
readNewline(in);
00540
for(
int i=0; i<ncounts; i++)
00541 {
00542
real value;
00543
StatsCollectorCounts c;
00544
PLearn::read(in, value);
00545
PLearn::read(in, c.
n);
00546
PLearn::read(in, c.
nbelow);
00547
PLearn::read(in, c.
sum);
00548
PLearn::read(in, c.
sumsquare);
00549
readNewline(in);
00550
counts[value] = c;
00551 }
00552
readFooter(in,
"StatsCollector");
00553 }
00554
00555
00559 real StatsCollector::getStat(
const string& statname)
const
00560
{
00561
typedef real (StatsCollector::*STATFUN)()
const;
00562
static bool init =
false;
00563
static map<string,STATFUN> statistics;
00564
if (!init) {
00565 statistics[
"E"] = STATFUN(&StatsCollector::mean);
00566 statistics[
"V"] = STATFUN(&StatsCollector::variance);
00567 statistics[
"STDDEV"] = STATFUN(&StatsCollector::stddev);
00568 statistics[
"STDERROR"] = STATFUN(&StatsCollector::stderror);
00569 statistics[
"MIN"] = STATFUN(&StatsCollector::min);
00570 statistics[
"MAX"] = STATFUN(&StatsCollector::max);
00571 statistics[
"SUM"] = STATFUN(&StatsCollector::sum);
00572 statistics[
"SUMSQ"] = STATFUN(&StatsCollector::sumsquare);
00573 statistics[
"FIRST"] = STATFUN(&StatsCollector::first_obs);
00574 statistics[
"LAST"] = STATFUN(&StatsCollector::last_obs);
00575 statistics[
"N"] = STATFUN(&StatsCollector::n);
00576 statistics[
"NMISSING"] = STATFUN(&StatsCollector::nmissing);
00577 statistics[
"NNONMISSING"] = STATFUN(&StatsCollector::nnonmissing);
00578 statistics[
"SHARPERATIO"] = STATFUN(&StatsCollector::sharperatio);
00579 init =
true;
00580 }
00581
00582 map<string,STATFUN>::iterator fun = statistics.find(statname);
00583
if (fun == statistics.end())
00584
PLERROR(
"In StatsCollector::getStat, invalid statname %s",
00585 statname.c_str());
00586
else
00587
return (this->*(fun->second))();
00588
return 0;
00589 }
00590
00591
00592 TVec<RealMapping> computeRanges(
TVec<StatsCollector> stats,
int discrete_mincount,
int continuous_mincount)
00593 {
00594
TVec<RealMapping> ranges;
00595
int n = stats.
length();
00596 ranges.
resize(n);
00597
for(
int k=0;
k<n;
k++)
00598 ranges[
k] = stats[
k].getBinMapping(discrete_mincount, continuous_mincount);
00599
return ranges;
00600 }
00601
00602 }
00603
00604