Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

StatsCollector.cc

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // PLearn (A C++ Machine Learning Library) 00004 // Copyright (C) 2001,2002 Pascal Vincent 00005 // 00006 00007 // Redistribution and use in source and binary forms, with or without 00008 // modification, are permitted provided that the following conditions are met: 00009 // 00010 // 1. Redistributions of source code must retain the above copyright 00011 // notice, this list of conditions and the following disclaimer. 00012 // 00013 // 2. Redistributions in binary form must reproduce the above copyright 00014 // notice, this list of conditions and the following disclaimer in the 00015 // documentation and/or other materials provided with the distribution. 00016 // 00017 // 3. The name of the authors may not be used to endorse or promote 00018 // products derived from this software without specific prior written 00019 // permission. 00020 // 00021 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00022 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00023 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00024 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00025 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00026 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00027 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00028 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00029 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00030 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00031 // 00032 // This file is part of the PLearn library. For more information on the PLearn 00033 // library, go to the PLearn Web site at www.plearn.org 00034 00035 00036 00037 /* ******************************************************* 00038 * $Id: StatsCollector.cc,v 1.37 2004/08/11 12:22:44 tihocan Exp $ 00039 * This file is part of the PLearn library. 00040 ******************************************************* */ 00041 00042 #include "StatsCollector.h" 00043 #include "TMat_maths.h" 00044 00045 namespace PLearn { 00046 using namespace std; 00047 00048 PLEARN_IMPLEMENT_OBJECT( 00049 StatsCollector, 00050 "Collects basic statistics", 00051 "A StatsCollector allows to compute basic global statistics for a series of numbers,\n" 00052 "as well as statistics within automatically determined ranges.\n" 00053 "The first maxnvalues encountered values will be used as reference points to define\n" 00054 "the ranges, so to get reasonable results, your sequence should be iid, and NOT sorted!\n" 00055 "\n" 00056 "The following statistics are available:" 00057 " - E Sample mean\n" 00058 " - V Sample variance\n" 00059 " - STDDEV Sample standard deviation\n" 00060 " - STDERROR Standard error of the mean\n" 00061 " - MIN Minimum value\n" 00062 " - MAX Maximum value\n" 00063 " - SUM Sum of observations \n" 00064 " - SUMSQ Sum of squares\n" 00065 " - FIRST First observation\n" 00066 " - LAST Last observation\n" 00067 " - N Total number of observations\n" 00068 " - NMISSING Number of missing observations\n" 00069 " - NNONMISSING Number of non-missing observations\n" 00070 " - SHARPERATIO Mean divided by standard deviation\n"); 00071 00072 00073 StatsCollector::StatsCollector(int the_maxnvalues) 00074 : maxnvalues(the_maxnvalues), 00075 nmissing_(0.), nnonmissing_(0.), 00076 sum_(0.), sumsquare_(0.), 00077 min_(MISSING_VALUE), max_(MISSING_VALUE), 00078 first_(MISSING_VALUE), last_(MISSING_VALUE) 00079 { 00080 build_(); 00081 } 00082 00083 int sortIdComparator(const void* i1, const void* i2) 00084 { 00085 double d = ((PairRealSCCType*)i1)->first - ((PairRealSCCType*)i2)->first; 00086 return (d<0)?-1:(d==0?0:1); 00087 } 00088 00091 void StatsCollector::sortIds() 00092 { 00093 PairRealSCCType* allreals= new PairRealSCCType[counts.size()]; 00094 int i=0; 00095 for(map<real,StatsCollectorCounts>::iterator it = counts.begin();it!=counts.end();++it,i++) 00096 allreals[i]=make_pair(it->first,&(it->second)); 00097 qsort(allreals,counts.size(),sizeof(PairRealSCCType),sortIdComparator); 00098 for(int i=0;i<(int)counts.size();i++) 00099 allreals[i].second->id=i; 00100 delete allreals; 00101 } 00102 00103 void StatsCollector::declareOptions(OptionList& ol) 00104 { 00105 // buid options 00106 00107 declareOption(ol, "maxnvalues", &StatsCollector::maxnvalues, OptionBase::buildoption, 00108 "maximum number of different values defining ranges to keep track of in counts\n" 00109 "(if 0, we will only keep track of global statistics)"); 00110 00111 // learnt options 00112 declareOption(ol, "nmissing_", &StatsCollector::nmissing_, OptionBase::learntoption, 00113 "number of missing values"); 00114 declareOption(ol, "nnonmissing_", &StatsCollector::nnonmissing_, OptionBase::learntoption, 00115 "number of non missing value "); 00116 declareOption(ol, "sum_", &StatsCollector::sum_, OptionBase::learntoption, 00117 "sum of all values"); 00118 declareOption(ol, "sumsquare_", &StatsCollector::sumsquare_, OptionBase::learntoption, 00119 "sum of square of all values "); 00120 declareOption(ol, "min_", &StatsCollector::min_, OptionBase::learntoption, 00121 "the min"); 00122 declareOption(ol, "max_", &StatsCollector::max_, OptionBase::learntoption, 00123 "the max"); 00124 declareOption(ol, "first_", &StatsCollector::first_, OptionBase::learntoption, 00125 "first encountered observation"); 00126 declareOption(ol, "last_", &StatsCollector::last_, OptionBase::learntoption, 00127 "last encountered observation"); 00128 declareOption(ol, "counts", &StatsCollector::counts, OptionBase::learntoption, 00129 "will contain up to maxnvalues values and associated Counts\n" 00130 "as well as a last element which maps FLT_MAX, so that we don't miss anything\n" 00131 "(remains empty if maxnvalues=0)"); 00132 00133 // Now call the parent class' declareOptions 00134 inherited::declareOptions(ol); 00135 } 00136 00137 void StatsCollector::build_() 00138 { 00139 // make sure count.size==0. If not, the object must have been loaded, and FLT_MAX is an existing key 00140 // but rounded to some precision, and there would be 2 keys approx.= FLT_MAX 00141 if(maxnvalues>0 && counts.size()==0) 00142 counts[FLT_MAX] = StatsCollectorCounts(); 00143 } 00144 00145 void StatsCollector::build() 00146 { 00147 inherited::build(); 00148 build_(); 00149 } 00150 00151 void StatsCollector::forget() 00152 { 00153 nmissing_ = 0.; 00154 nnonmissing_ = 0.; 00155 sum_ = 0.; 00156 sumsquare_ = 0.; 00157 min_ = MISSING_VALUE; 00158 max_ = MISSING_VALUE; 00159 first_ = last_ = MISSING_VALUE; 00160 counts.clear(); 00161 build_(); 00162 } 00163 00164 void StatsCollector::update(real val, real weight) 00165 { 00166 if(is_missing(val)) 00167 nmissing_ += weight; 00168 else 00169 { 00170 //sum_ += val * weight; 00171 //sumsquare_ += val*val * weight; 00172 last_ = val; 00173 if(nnonmissing_==0) // first value encountered 00174 min_ = max_ = first_ = last_ = val; 00175 else if(val<min_) 00176 min_ = val; 00177 else if(val>max_) 00178 max_ = val; 00179 nnonmissing_ += weight; 00180 double sqval = (val-first_)*(val-first_); 00181 sum_ += (val-first_) * weight; 00182 sumsquare_ += sqval * weight; 00183 00184 if(maxnvalues>0) // also remembering statistics inside values ranges 00185 { 00186 map<real,StatsCollectorCounts>::iterator it; 00187 if(int(counts.size())<=maxnvalues) // Still remembering new unseen values 00188 { 00189 it = counts.find(val); 00190 if(it==counts.end()) 00191 counts[val].id=counts.size()-1; 00192 counts[val].n += weight; 00193 } 00194 else // We've filled up counts already 00195 { 00196 it = counts.lower_bound(val); 00197 if(it->first==val) // found the exact value 00198 it->second.n += weight; 00199 else // found the value just above val (possibly FLT_MAX) 00200 { 00201 it->second.nbelow += weight; 00202 it->second.sum += val * weight; 00203 it->second.sumsquare += val*val * weight; 00204 } 00205 } 00206 } 00207 } 00208 } 00209 00210 RealMapping StatsCollector::getBinMapping(double discrete_mincount, 00211 double continuous_mincount, 00212 real tolerance, 00213 TVec<double> * fcount) const 00214 { 00215 real mapto=0.; 00216 RealMapping mapping; 00217 mapping.setMappingForOther(-1); 00218 map<real,StatsCollectorCounts>::const_iterator it = counts.begin(); 00219 int nleft = (int)counts.size()-1; // loop on all but last 00220 00221 if(fcount) 00222 { 00223 (*fcount) = TVec<double>(); 00224 // ouch, assume discrete_mincount == continuous_mincount 00225 fcount->resize(0, int(2.*nnonmissing_ / discrete_mincount)); 00226 fcount->append(nmissing_); 00227 fcount->append(0); 00228 } 00229 00230 double count = 0, count2 = 0; 00231 real low = min_; 00232 real high = min_; 00233 bool low_has_been_appended = false; 00234 // ProgressBar pb("Computing PseudoQ Mapping...",counts.size()-1); 00235 00236 while(nleft--) 00237 { 00238 high = it->first; 00239 // pb(counts.size()-1-nleft); 00240 count += it->second.nbelow; 00241 count2 += it->second.nbelow; 00242 // cerr << "it->first:"<<it->first<<" nbelow:"<<it->second.nbelow<<" n:"<<it->second.n<<endl; 00243 if(count>=continuous_mincount) 00244 { 00245 // append continuous range 00246 mapping.addMapping( 00247 RealRange(low_has_been_appended?']':'[',low, high, '['), 00248 mapto++); 00249 if(fcount) 00250 fcount->append(count); 00251 low = high; 00252 low_has_been_appended = false; 00253 count = 0; 00254 00255 } 00256 00257 if(it->second.n >= discrete_mincount) 00258 { 00259 if(count>0) // then append the previous continuous range 00260 { 00261 mapping.addMapping(RealRange(low_has_been_appended?']':'[',low, high, '['), mapto++); 00262 if(fcount) 00263 fcount->append(count); 00264 count = 0; 00265 } 00266 // append discrete point 00267 mapping.addMapping(RealRange('[',high,high,']'), mapto++); 00268 if(fcount) 00269 fcount->append(it->second.n + count); 00270 count2+=it->second.n; 00271 count=0; 00272 low = high; 00273 low_has_been_appended = true; 00274 } 00275 else 00276 { 00277 count2+=it->second.n; 00278 count += it->second.n; 00279 } 00280 ++it; 00281 } 00282 00283 if(it->first<=max_) 00284 PLERROR("Bug in StatsCollector::getBinMapping expected last element of mapping to be FLT_MAX..."); 00285 00286 if (mapping.size() == 0) 00287 { 00288 PLWARNING("StatsCollector::getBinMapping: no mapping were created; probably a bug"); 00289 mapping.addMapping(RealRange('[',min_,max_,']'), 0); 00290 return mapping; 00291 } 00292 00293 // make sure we include max_ 00294 pair<RealRange, real> m = mapping.lastMapping(); 00295 00296 // cnt is the number of elements that would be in the last bin 00297 double cnt = nnonmissing_ - count2 + count; 00298 00299 // If the bin we're about to add is short of less then tolerance*100% of continuous_mincount elements, 00300 // OR if the last we added is a discrete point AND the max is not already in the last range, we append it 00301 if(m.first.high<max_) 00302 { 00303 if( ((real)cnt/(real)continuous_mincount)>(1.-tolerance) || 00304 (m.first.low == m.first.high)) 00305 { 00306 // don't join last bin with last-but-one bin 00307 mapping.addMapping(RealRange(m.first.rightbracket=='[' ? '[' : ']',m.first.high,max_,']'), 00308 mapto++); 00309 if(fcount) 00310 fcount->append(cnt); 00311 } 00312 else 00313 { 00314 // otherwise, we can join it with the previous 00315 mapping.removeMapping(m.first); 00316 mapping.addMapping(RealRange(m.first.leftbracket, m.first.low, max_, ']'), 00317 m.second); 00318 if(fcount) 00319 { 00320 double v = fcount->back(); 00321 fcount->pop_back(); 00322 fcount->append(v+cnt); 00323 } 00324 } 00325 } 00326 else if(m.first.high==max_) // make sure we have a closing bracket on the max_ 00327 { 00328 mapping.removeMapping(m.first); 00329 mapping.addMapping(RealRange(m.first.leftbracket, m.first.low, max_, ']'), 00330 m.second); 00331 } 00332 return mapping; 00333 } 00334 00335 00336 RealMapping StatsCollector::getAllValuesMapping(TVec<double> * fcount) const 00337 { 00338 return getAllValuesMapping(0,fcount); 00339 } 00340 00341 RealMapping StatsCollector::getAllValuesMapping(TVec<bool>* to_be_included, 00342 TVec<double>* fcount, bool ignore_other, 00343 real tolerance) const { 00344 RealMapping mapping; 00345 if (ignore_other) { 00346 mapping.keep_other_as_is = false; 00347 mapping.other_mapsto = -1; 00348 } 00349 int i = 0; 00350 int k = 0; 00351 if(fcount) 00352 { 00353 (*fcount) = TVec<double>(); 00354 fcount->resize(0,counts.size()+2); 00355 fcount->append(nmissing_); 00356 fcount->append(0); 00357 } 00358 00359 double count=0; 00360 00361 real epsilon = 0; 00362 if (tolerance > 0) { 00363 // Compute the expansion coefficient 'epsilon'. 00364 StatsCollector values_diff; 00365 for (map<real,StatsCollectorCounts>::const_iterator it = counts.begin(); 00366 size_t(i) < counts.size() - 2; i++) { 00367 real val1 = it->first; 00368 it++; 00369 real val2 = it->first; 00370 values_diff.update(val2 - val1); 00371 } 00372 // Mean of the difference between two consecutive values. 00373 real mean = values_diff.mean(); 00374 epsilon = tolerance * mean; 00375 if (epsilon < 0) { 00376 PLERROR("In StatsCollector::getAllValuesMapping - epsilon < 0, there must be something wrong"); 00377 } 00378 } 00379 00380 i = 0; 00381 00382 for(map<real,StatsCollectorCounts>::const_iterator it = counts.begin() ; 00383 size_t(i) < counts.size() - 1; ++it) 00384 { 00385 real low_val = it->first - epsilon; 00386 real up_val = it->first + epsilon; 00387 map<real,StatsCollectorCounts>::const_iterator itup = it; 00388 itup++; 00389 int j = i + 1; 00390 bool to_include = true; 00391 if (to_be_included) { 00392 to_include = (*to_be_included)[i]; 00393 } 00394 real count_in_range = it->second.n; 00395 if (tolerance > 0) { 00396 for (; itup != counts.end(); itup++) { 00397 if (itup->first - epsilon <= up_val) { 00398 // The next mapping needs to be merged with the current one. 00399 if (fcount) { 00400 PLWARNING("In StatsCollector::getAllValuesMapping - You are using fcount and some ranges are merged. " 00401 "This case has not been tested yet. Please remove this warning if it works fine."); 00402 } 00403 up_val = itup->first + epsilon; 00404 count_in_range += itup->second.n; 00405 if (to_be_included) { 00406 // As long as one of the merged mappings needs to be included, 00407 // we include the result of the merge. 00408 to_include = to_include || (*to_be_included)[j]; 00409 } 00410 j++; 00411 } else { 00412 // No merging. 00413 break; 00414 } 00415 } 00416 } 00417 // Because the last one won't be merged (even if all are merged, the one 00418 // with FLT_MAX won't). 00419 itup--; 00420 it = itup; 00421 i = j - 1; 00422 00423 if (to_include) { 00424 mapping.addMapping(RealRange('[',low_val,up_val,']'),k); 00425 k++; 00426 if(fcount) 00427 { 00428 count += count_in_range; 00429 fcount->append(count_in_range); 00430 } 00431 } 00432 i++; 00433 } 00434 00435 if(fcount) 00436 (*fcount)[1] = nnonmissing_ - count; 00437 return mapping; 00438 } 00439 00440 Mat StatsCollector::cdf(bool normalized) const 00441 { 00442 int l = 2*(int)counts.size(); 00443 00444 Mat xy(l+1,2); 00445 int i=0; 00446 double currentcount = 0; 00447 xy(i,0) = min_; 00448 xy(i++,1) = 0; 00449 map<real,StatsCollectorCounts>::const_iterator it = counts.begin(); 00450 map<real,StatsCollectorCounts>::const_iterator itend = counts.end(); 00451 for(; it!=itend; ++it) 00452 { 00453 real val = it->first; 00454 if(val>max_) 00455 val = max_; 00456 00457 currentcount += it->second.nbelow; 00458 xy(i,0) = val; 00459 xy(i++,1) = currentcount; 00460 00461 currentcount += it->second.n; 00462 xy(i,0) = val; 00463 xy(i++,1) = currentcount; 00464 } 00465 if(normalized) 00466 xy.column(1) /= real(nnonmissing_); 00467 00468 return xy; 00469 } 00470 00471 void StatsCollector::print(ostream& out) const 00472 { 00473 out << "# samples: " << n() << "\n"; 00474 out << "# missing: " << nmissing() << "\n"; 00475 out << "mean: " << mean() << "\n"; 00476 out << "stddev: " << stddev() << "\n"; 00477 out << "stderr: " << stderror() << "\n"; 00478 out << "min: " << min() << "\n"; 00479 out << "max: " << max() << "\n\n"; 00480 out << "first: " << first_obs() << "\n"; 00481 out << "last: " << last_obs() << "\n\n"; 00482 out << "counts size: " << counts.size() << "\n"; 00483 map<real,StatsCollectorCounts>::const_iterator it = counts.begin(); 00484 map<real,StatsCollectorCounts>::const_iterator itend = counts.end(); 00485 for(; it!=itend; ++it) 00486 { 00487 out << "value: " << it->first 00488 << " #equal:" << it->second.n 00489 << " #less:" << it->second.nbelow 00490 << " avg_of_less:" << it->second.sum/it->second.nbelow << endl; 00491 } 00492 } 00493 00494 void StatsCollector::oldwrite(ostream& out) const 00495 { 00496 writeHeader(out,"StatsCollector",0); 00497 writeField(out, "nmissing_", nmissing_); 00498 writeField(out, "nnonmissing_", nnonmissing_); 00499 writeField(out, "sum_", sum_); 00500 writeField(out, "sumsquare_", sumsquare_); 00501 writeField(out, "min_", min_); 00502 writeField(out, "max_", max_); 00503 writeField(out, "maxnvalues", maxnvalues); 00504 00505 writeFieldName(out, "counts"); 00506 PLearn::write(out, (int)counts.size()); 00507 writeNewline(out); 00508 map<real,StatsCollectorCounts>::const_iterator it = counts.begin(); 00509 map<real,StatsCollectorCounts>::const_iterator itend = counts.end(); 00510 for(; it!=itend; ++it) 00511 { 00512 PLearn::write(out, it->first); 00513 PLearn::write(out, it->second.n); 00514 PLearn::write(out, it->second.nbelow); 00515 PLearn::write(out, it->second.sum); 00516 PLearn::write(out, it->second.sumsquare); 00517 writeNewline(out); 00518 } 00519 writeFooter(out,"StatsCollector"); 00520 } 00521 00522 void StatsCollector::oldread(istream& in) 00523 { 00524 int version = readHeader(in,"StatsCollector"); 00525 if(version!=0) 00526 PLERROR("In StatsCollector::oldead don't know how to read this version"); 00527 readField(in, "nmissing_", nmissing_); 00528 readField(in, "nnonmissing_", nnonmissing_); 00529 readField(in, "sum_", sum_); 00530 readField(in, "sumsquare_", sumsquare_); 00531 readField(in, "min_", min_); 00532 readField(in, "max_", max_); 00533 readField(in, "maxnvalues", maxnvalues); 00534 00535 readFieldName(in, "counts", true); 00536 counts.clear(); 00537 int ncounts; 00538 PLearn::read(in, ncounts); 00539 readNewline(in); 00540 for(int i=0; i<ncounts; i++) 00541 { 00542 real value; 00543 StatsCollectorCounts c; 00544 PLearn::read(in, value); 00545 PLearn::read(in, c.n); 00546 PLearn::read(in, c.nbelow); 00547 PLearn::read(in, c.sum); 00548 PLearn::read(in, c.sumsquare); 00549 readNewline(in); 00550 counts[value] = c; 00551 } 00552 readFooter(in,"StatsCollector"); 00553 } 00554 00555 00559 real StatsCollector::getStat(const string& statname) const 00560 { 00561 typedef real (StatsCollector::*STATFUN)() const; 00562 static bool init = false; 00563 static map<string,STATFUN> statistics; 00564 if (!init) { 00565 statistics["E"] = STATFUN(&StatsCollector::mean); 00566 statistics["V"] = STATFUN(&StatsCollector::variance); 00567 statistics["STDDEV"] = STATFUN(&StatsCollector::stddev); 00568 statistics["STDERROR"] = STATFUN(&StatsCollector::stderror); 00569 statistics["MIN"] = STATFUN(&StatsCollector::min); 00570 statistics["MAX"] = STATFUN(&StatsCollector::max); 00571 statistics["SUM"] = STATFUN(&StatsCollector::sum); 00572 statistics["SUMSQ"] = STATFUN(&StatsCollector::sumsquare); 00573 statistics["FIRST"] = STATFUN(&StatsCollector::first_obs); 00574 statistics["LAST"] = STATFUN(&StatsCollector::last_obs); 00575 statistics["N"] = STATFUN(&StatsCollector::n); 00576 statistics["NMISSING"] = STATFUN(&StatsCollector::nmissing); 00577 statistics["NNONMISSING"] = STATFUN(&StatsCollector::nnonmissing); 00578 statistics["SHARPERATIO"] = STATFUN(&StatsCollector::sharperatio); 00579 init = true; 00580 } 00581 00582 map<string,STATFUN>::iterator fun = statistics.find(statname); 00583 if (fun == statistics.end()) 00584 PLERROR("In StatsCollector::getStat, invalid statname %s", 00585 statname.c_str()); 00586 else 00587 return (this->*(fun->second))(); 00588 return 0; 00589 } 00590 00591 00592 TVec<RealMapping> computeRanges(TVec<StatsCollector> stats, int discrete_mincount, int continuous_mincount) 00593 { 00594 TVec<RealMapping> ranges; 00595 int n = stats.length(); 00596 ranges.resize(n); 00597 for(int k=0; k<n; k++) 00598 ranges[k] = stats[k].getBinMapping(discrete_mincount, continuous_mincount); 00599 return ranges; 00600 } 00601 00602 } // end of namespace PLearn 00603 00604

Generated on Tue Aug 17 16:06:56 2004 for PLearn by doxygen 1.3.7