Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

SDBWithStats.cc

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // PLearn (A C++ Machine Learning Library) 00004 // Copyright (C) 1998 Pascal Vincent 00005 // Copyright (C) 1999-2002 Pascal Vincent, Yoshua Bengio and University of Montreal 00006 // 00007 00008 // Redistribution and use in source and binary forms, with or without 00009 // modification, are permitted provided that the following conditions are met: 00010 // 00011 // 1. Redistributions of source code must retain the above copyright 00012 // notice, this list of conditions and the following disclaimer. 00013 // 00014 // 2. Redistributions in binary form must reproduce the above copyright 00015 // notice, this list of conditions and the following disclaimer in the 00016 // documentation and/or other materials provided with the distribution. 00017 // 00018 // 3. The name of the authors may not be used to endorse or promote 00019 // products derived from this software without specific prior written 00020 // permission. 00021 // 00022 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00023 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00024 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00025 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00026 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00027 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00028 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00029 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00030 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00031 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00032 // 00033 // This file is part of the PLearn library. For more information on the PLearn 00034 // library, go to the PLearn Web site at www.plearn.org 00035 00036 00037 00038 00039 /* ******************************************************* 00040 * $Id: SDBWithStats.cc,v 1.2 2004/02/20 21:11:43 chrish42 Exp $ 00041 * AUTHORS: Pascal Vincent 00042 * This file is part of the PLearn library. 00043 ******************************************************* */ 00044 00045 #include "SDBWithStats.h" 00046 00047 namespace PLearn { 00048 using namespace std; 00049 00050 int FieldStat::max_nsymbols = 400; 00051 00052 void FieldStat::updateString(const string& sym) 00053 { 00054 ++nonmissing_; 00055 if(nsymbols()<max_nsymbols) 00056 symbolcount[sym]++; 00057 } 00058 00059 void FieldStat::updateNumber(double d) 00060 { 00061 ++nonmissing_; 00062 if(nsymbols()<max_nsymbols) 00063 symbolcount[tostring(d)]++; 00064 sum_ += d; 00065 sumsquare_ += d*d; 00066 if(d<min_) 00067 min_ = d; 00068 if(d>max_) 00069 max_ = d; 00070 } 00071 00072 void FieldStat::clear() 00073 { 00074 nonmissing_ = 0; 00075 missing_ = 0; 00076 sum_ = 0; 00077 sumsquare_ = 0; 00078 min_ = FLT_MAX; 00079 max_ = -FLT_MAX; 00080 mean_ = 0; 00081 stddev_ = 0; 00082 symbolcount.clear(); 00083 symbolid.clear(); 00084 } 00085 00086 void FieldStat::finalize() 00087 { 00088 mean_ = sum_/nonmissing_; 00089 double meansquare_ = sumsquare_/nonmissing_; 00090 stddev_ = sqrt( meansquare_ - square(mean_) ); 00091 if(nsymbols()>=max_nsymbols) // too many different values: ignore it (not really symbolic) 00092 { 00093 symbolcount.clear(); 00094 symbolid.clear(); 00095 } 00096 } 00097 00098 SDBWithStats::SDBWithStats(string basename, string path, AccessType access, bool verbose) 00099 :SDB(basename,path,access,verbose) 00100 { 00101 fieldstat.resize(getSchema().size()); 00102 if(hasStats()) 00103 loadStats(); 00104 } 00105 00106 void SDBWithStats::forgetStats() 00107 { 00108 for(int j=0; j<width(); j++) 00109 getStat(j).clear(); 00110 } 00111 00112 void SDBWithStats::computeStats(unsigned int nrows) 00113 { 00114 forgetStats(); 00115 const Schema& sc = getSchema(); 00116 Row row(&sc); 00117 00118 for(SDB::RowNumber i=0; i<nrows;i++) 00119 { 00120 getInRow(i, row); 00121 Row::const_iterator it = row.begin(); 00122 if (nrows>100000 && i%100000==0) 00123 cout << "SDBWithStats::computeStats processing row " << i << " of " << nrows << endl; 00124 00125 for (int j=0; j<nfields(); ++j, ++it) 00126 { 00127 if(it.isMissing()) 00128 fieldstat[j].updateMissing(); 00129 else 00130 { 00131 switch(it.getFieldType()) 00132 { 00133 case StringType: 00134 case CharacterType: 00135 fieldstat[j].updateString(tostring(it)); 00136 break; 00137 case SignedCharType: 00138 case ShortType: 00139 case IntType: 00140 case FloatType: 00141 case DoubleType: 00142 case DateType: 00143 fieldstat[j].updateNumber(todouble(it)); 00144 break; 00145 default: 00146 PLERROR("Unknown field type"); 00147 } 00148 } 00149 } 00150 } 00151 cout << "boucle terminee" << endl; 00152 for (int j=0; j<nfields(); ++j) 00153 fieldstat[j].finalize(); 00154 cout << "fini computestats" << endl; 00155 } 00156 00157 bool SDBWithStats::hasStats() 00158 { 00159 string numstatsfile = getPath()+getName()+".stats"; 00160 string symstatsfile = getPath()+getName()+".symbols"; 00161 return file_exists(numstatsfile.c_str()) && file_exists(symstatsfile.c_str()); 00162 } 00163 00164 void SDBWithStats::saveStats() 00165 { 00166 string numstatsfile = getPath()+getName()+".stats"; 00167 string symstatsfile = getPath()+getName()+".symbols"; 00168 00169 ofstream numstats(numstatsfile.c_str()); 00170 if(!numstats) 00171 PLERROR("could not open file %s for writing",numstatsfile.c_str()); 00172 00173 ofstream symstats(symstatsfile.c_str()); 00174 if(!symstats) 00175 PLERROR("could not open file %s for writing",symstatsfile.c_str()); 00176 00177 numstats.precision(8); 00178 symstats.precision(8); 00179 00180 for(unsigned int j=0; j<fieldstat.size(); j++) 00181 { 00182 FieldStat& s = fieldstat[j]; 00183 numstats << fieldname(j) << ' ' << s.nonmissing() << ' ' << s.missing() << ' ' 00184 << s.mean() << ' ' << s.stddev() << ' ' << s.min() << ' ' << s.max() << endl; 00185 00186 symstats << fieldname(j) << ' '; 00187 symstats << s.nsymbols() << " "; 00188 map<string,int>::iterator it; 00189 for(it = s.symbolcount.begin(); it!= s.symbolcount.end(); ++it) 00190 symstats << it->first << ' ' << it->second << " "; 00191 symstats << endl; 00192 } 00193 } 00194 00195 void SDBWithStats::loadStats() 00196 { 00197 forgetStats(); 00198 string numstatsfile = getPath()+getName()+".stats"; 00199 string symstatsfile = getPath()+getName()+".symbols"; 00200 ifstream numstats(numstatsfile.c_str()); 00201 if(!numstats) 00202 PLERROR("could not open file %s for reading",numstatsfile.c_str()); 00203 00204 ifstream symstats(symstatsfile.c_str()); 00205 if(!symstats) 00206 PLERROR("could not open file %s for reading",symstatsfile.c_str()); 00207 00208 for(unsigned int j=0; j<fieldstat.size(); j++) 00209 { 00210 string str_nonmissing_,str_missing_,str_mean_,str_stddev_,str_min_,str_max_; 00211 FieldStat& s = fieldstat[j]; 00212 string name; 00213 numstats >> name; 00214 00215 //cout<<"field : "<<j<<" name:"<<name<<" fieldname(j):"<<fieldname(j)<<endl; 00216 if(name!=fieldname(j)) 00217 PLERROR("Row number %d of file %s does not correpond to field number %d: %s",j,numstatsfile.c_str(),j,fieldname(j).c_str()); 00218 00219 // **** we use strings as intermediate type to handle nans (julien) 00220 numstats >> str_nonmissing_ >> str_missing_ >> str_mean_ >> str_stddev_ >> str_min_ >> str_max_; 00221 00222 if(str_mean_=="nan")str_mean_=""; 00223 if(str_stddev_=="nan")str_stddev_=""; 00224 if(str_min_=="nan")str_min_=""; 00225 if(str_max_=="nan")str_max_=""; 00226 00227 s.nonmissing_ = toint(str_nonmissing_); 00228 s.missing_ = toint(str_missing_); 00229 s.mean_ = tofloat(str_mean_); 00230 s.stddev_ = tofloat(str_stddev_); 00231 s.min_ = tofloat(str_min_); 00232 s.max_ = tofloat(str_max_); 00233 00234 //cout <<" "<< s.nonmissing_ <<" "<< s.missing_ <<" "<< s.mean_ <<" "<< s.stddev_ <<" "<< s.min_ <<" "<< s.max_<<endl; 00235 00236 symstats >> name; 00237 if(name!=fieldname(j)) 00238 PLERROR("Row number %d of file %s does not correpond to field number %d: %s",j,symstatsfile.c_str(),j,fieldname(j).c_str()); 00239 00240 int nsymbols; 00241 symstats >> nsymbols; 00242 string sym; 00243 int symcount; 00244 for(int k=0; k<nsymbols; k++) 00245 { 00246 symstats >> sym >> symcount; 00247 s.symbolcount[sym] = symcount; 00248 s.symbolid[sym]=k; 00249 } 00250 } 00251 00252 } 00253 00254 FieldStat& SDBWithStats::getStat(int i) 00255 { 00256 if(i<0 || i>=width()) 00257 PLERROR("Out of bounds"); 00258 return fieldstat[i]; 00259 } 00260 00261 const FieldStat& SDBWithStats::getStat(int i) const 00262 { 00263 if(i<0 || i>=width()) 00264 PLERROR("Out of bounds"); 00265 return fieldstat[i]; 00266 } 00267 00268 FieldStat& SDBWithStats::getStat(const string& fieldname) 00269 { 00270 int pos = indexOfField(fieldname); 00271 if(pos<0) 00272 PLERROR("No field named %s",fieldname.c_str()); 00273 return getStat(pos); 00274 } 00275 00276 const FieldStat& SDBWithStats::getStat(const string& fieldname) const 00277 { 00278 int pos = indexOfField(fieldname); 00279 if(pos<0) 00280 PLERROR("No field named %s",fieldname.c_str()); 00281 return getStat(pos); 00282 } 00283 00284 00285 } // end of namespace PLearn

Generated on Tue Aug 17 16:04:41 2004 for PLearn by doxygen 1.3.7