Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

HistogramDistribution.cc

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // HistogramDistribution.cc 00004 // 00005 // Copyright (C) 2002 Yoshua Bengio, Pascal Vincent, Xavier Saint-Mleux 00006 // 00007 // Redistribution and use in source and binary forms, with or without 00008 // modification, are permitted provided that the following conditions are met: 00009 // 00010 // 1. Redistributions of source code must retain the above copyright 00011 // notice, this list of conditions and the following disclaimer. 00012 // 00013 // 2. Redistributions in binary form must reproduce the above copyright 00014 // notice, this list of conditions and the following disclaimer in the 00015 // documentation and/or other materials provided with the distribution. 00016 // 00017 // 3. The name of the authors may not be used to endorse or promote 00018 // products derived from this software without specific prior written 00019 // permission. 00020 // 00021 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00022 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00023 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00024 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00025 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00026 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00027 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00028 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00029 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00030 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00031 // 00032 // This file is part of the PLearn library. For more information on the PLearn 00033 // library, go to the PLearn Web site at www.plearn.org 00034 00035 /* ******************************************************* 00036 * $Id: HistogramDistribution.cc,v 1.12 2004/02/20 21:14:46 chrish42 Exp $ 00037 ******************************************************* */ 00038 00040 #include "HistogramDistribution.h" 00041 //#include <algorithm> 00042 //#include <cmath> 00043 00044 namespace PLearn { 00045 using namespace std; 00046 00047 HistogramDistribution::HistogramDistribution() {} 00048 00049 HistogramDistribution::HistogramDistribution(VMat data, PP<Binner> binner_, 00050 PP<Smoother> smoother_) 00051 :bin_positions(data.length()+1), bin_density(data.length()), survival_values(data.length()), 00052 binner(binner_), smoother(smoother_) 00053 { 00054 setTrainingSet(data); 00055 train(); 00056 } 00057 00058 PLEARN_IMPLEMENT_OBJECT(HistogramDistribution, 00059 "Represents and possibly learns (using a smoother) a univariate distribution as a histogram.", 00060 "This class represents a univariate distribution with a set of bins and their densities\n" 00061 "The bins can be fixed or learned by a Binner object, and the densities\n" 00062 "can be learned from a training set. The empirical densities in the bins can also\n" 00063 "be smoothed with a Smoother (which is a general purpose univariate function\n" 00064 "smoothing mechanism. If the data is not univariate, then only the LAST column\n" 00065 "is considered. The smoother can either smooth the density or the survival fn.\n"); 00066 00067 void HistogramDistribution::declareOptions(OptionList& ol) 00068 { 00069 declareOption(ol, "bin_positions", &HistogramDistribution::bin_positions, OptionBase::learntoption, 00070 "The n+1 positions that define n bins. There is one more bin position " 00071 "than number of bins, all the bins are supposed adjacent."); 00072 00073 declareOption(ol, "bin_density", &HistogramDistribution::bin_density, OptionBase::learntoption, 00074 "Density of the distribution for each bin. The density is supposed " 00075 "constant within each bin:\n" 00076 "\t p(x) = bin_density[i] if bin_positions[i] < x <= bin_positions[i+1]."); 00077 00078 declareOption(ol, "survival_values", &HistogramDistribution::survival_values, OptionBase::learntoption, 00079 "Redundant with density is the pre-computed survival function."); 00080 00081 declareOption(ol, "binner", &HistogramDistribution::binner, OptionBase::buildoption, 00082 "Used to do binning at training time (although a fixed binning scheme can be\n" 00083 "obtained by using a ManualBinner.B)"); 00084 00085 declareOption(ol, "smoother", &HistogramDistribution::smoother, OptionBase::buildoption, 00086 "Used to smooth learned density (or survival) at train time, after the empirical\n" 00087 "frequencies of each bin have been collected\n"); 00088 00089 declareOption(ol, "smooth_density_instead_of_survival_fn", 00090 &HistogramDistribution::smooth_density_instead_of_survival_fn, OptionBase::buildoption, 00091 "whether to smooth the density or the survival function, with the smoother\n"); 00092 00093 // Now call the parent class' declareOptions 00094 inherited::declareOptions(ol); 00095 } 00096 00097 void HistogramDistribution::build_() 00098 { 00099 } 00100 00101 // ### Nothing to add here, simply calls build_ 00102 void HistogramDistribution::build() 00103 { 00104 inherited::build(); 00105 build_(); 00106 } 00107 00108 void HistogramDistribution::train() 00109 { 00110 00111 /* 00112 - prend la distri empirique 00113 | trie les points 00114 | merge les bins (possiblement sous contraintes) 00115 | - points de coupure predefinis (option include_cutpoints) ManualBinner 00116 | - largeur des bins > a une valeur minimale 00117 | - bins contenir un minimum de points 00118 Binner 00119 00120 Smoother 00121 (recalcule la densite) 00122 00123 calculer survival_values 00124 */ 00125 00126 if(train_set->width() != inputsize()+targetsize()) 00127 PLERROR("In HistogramDistribution::train(VMat training_set) training_set->width() != inputsize()+targetsize()"); 00128 if(train_set->width() != 1) 00129 PLERROR("In HistogramDistribution::train() train_set->width() must be 1 (column vec.)"); 00130 if(binner == 0) 00131 PLERROR("In HistogramDistribution::train() Can't train without a Binner."); 00132 00133 Vec data(train_set.length()); 00134 data << train_set.getColumn(train_set.width()-1); 00135 00136 PP<RealMapping> binning= binner->getBinning(train_set); 00137 binning->setMappingForOther(0.0); 00138 binning->transform(data); 00139 00140 bin_positions= binning->getCutPoints(); 00141 bin_density.resize(bin_positions.length()-1); 00142 survival_values.resize(bin_positions.length()-1); 00143 00144 for(int i= 0; i < data.length(); ++i) 00145 ++survival_values[static_cast<int>(data[i])]; 00146 for(int i= survival_values.length()-2; i >= 0; --i) 00147 survival_values[i]+= survival_values[i+1]; 00148 for(int i= survival_values.length()-1; i >= 0; --i) 00149 survival_values[i]/= survival_values[0]; 00150 00151 if(smoother) 00152 { 00153 if (smooth_density_instead_of_survival_fn) 00154 { 00155 calc_density_from_survival(); 00156 Vec df(bin_density.length()); 00157 df << bin_density; 00158 smoother->smooth(df, bin_density, bin_positions, bin_positions); 00159 calc_survival_from_density(); 00160 } 00161 else 00162 { 00163 Vec sv(survival_values.length()); 00164 sv << survival_values; 00165 smoother->smooth(sv, survival_values, bin_positions, bin_positions); 00166 calc_density_from_survival(); 00167 } 00168 } 00169 else 00170 calc_density_from_survival(); 00171 } 00172 00173 void HistogramDistribution::computeOutput(const Vec& input, Vec& output) 00174 { 00175 if(input.size() != 1 || output.size() != 1) 00176 PLERROR("In HistogramDistribution::use implemented only for reals; i.e. input.size()=output.size()=1. " 00177 "Got input.size()=%d and output.size()=%d", input.size(), output.size()); 00178 // outputs_def: 'l'->log_density, 'd' -> density, 'c' -> cdf, 's' -> survival_fn, 'e' -> expectation, 'v' -> variance 00179 if(outputs_def == "l") output[0]= log_density(input); 00180 else if(outputs_def == "d") output[0]= density(input); 00181 else if(outputs_def == "c") output[0]= cdf(input); 00182 else if(outputs_def == "s") output[0]= survival_fn(input); 00183 else if(outputs_def == "e") { Vec mu(1); expectation(mu); output[0]= mu[0]; } 00184 else if(outputs_def == "v") { Mat m(1,1); variance(m); output[0]= m(0,0); } 00185 else PLERROR("In HistogramDistribution::use unknown value for outputs_def= \"%s\"", outputs_def.c_str()); 00186 } 00187 00188 void HistogramDistribution::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies) 00189 { 00190 PLearner::makeDeepCopyFromShallowCopy(copies); 00191 00192 deepCopyField(bin_positions, copies); 00193 deepCopyField(bin_density, copies); 00194 deepCopyField(survival_values, copies); 00195 deepCopyField(binner, copies); 00196 deepCopyField(smoother, copies); 00197 } 00198 00199 double HistogramDistribution::log_density(const Vec& x) const 00200 { 00201 return log(density(x)); 00202 } 00203 00204 00205 double HistogramDistribution::density(const Vec& x) const 00206 { 00207 if(x.size() != 1) 00208 PLERROR("HistogramDistribution::density implemented only for univariate data (vec size == 1)."); 00209 return bin_density[find_bin(x[0])]; 00210 } 00211 00212 00213 double HistogramDistribution::survival_fn(const Vec& x) const 00214 { 00215 if(x.size() != 1) 00216 PLERROR("HistogramDistribution::survival_fn implemented only for univariate data (vec size == 1)."); 00217 int bin= find_bin(x[0]); 00218 if(bin < 0) 00219 if(x[0] < bin_positions[0]) 00220 return 1.0; 00221 else 00222 return 0.0; 00223 00224 if(x[0] < bin_positions[bin] && bin >= 1) 00225 return survival_values[bin-1] + (x[0] - bin_positions[bin-1]) * 00226 (survival_values[bin] - survival_values[bin-1]) / (bin_positions[bin] - bin_positions[bin-1]); 00227 00228 return survival_values[bin]; 00229 } 00230 00231 double HistogramDistribution::cdf(const Vec& x) const 00232 { 00233 return 1.0-survival_fn(x); 00234 } 00235 00236 void HistogramDistribution::expectation(Vec& mu) const 00237 { 00238 if(mu.size() != 1) 00239 PLERROR("HistogramDistribution::expectation implemented only for univariate data (vec size == 1)."); 00240 real sum= 0.0; 00241 for(int i= 0; i < bin_density.size(); ++i) 00242 sum+= bin_density[i] * (bin_positions[i+1]-bin_positions[i]) * (bin_positions[i]+bin_positions[i+1])/2; 00243 // sum+= bin_density[i] * bin_positions[i+1]; 00244 mu[0]=sum; 00245 } 00246 00247 void HistogramDistribution::variance(Mat& cov) const 00248 { 00249 if(cov.size() != 1) 00250 PLERROR("HistogramDistribution::variance implemented only for univariate data"); 00251 real sumsq= 0.0, sum= 0.0, s; 00252 int n= bin_density.size(); 00253 for(int i= 0; i < n; ++i) 00254 { 00255 s= bin_density[i] * (bin_positions[i+1]-bin_positions[i]) * (bin_positions[i]+bin_positions[i+1])/2; 00256 sum+= s; 00257 sumsq+= s*s; 00258 } 00259 cov(0,0) = abs(sumsq-(sum*sum)/n)/n; 00260 } 00261 00262 double HistogramDistribution::prob_in_range(const Vec& x0, const Vec& x1) const 00263 { 00264 return survival_fn(x0) - survival_fn(x1); 00265 } 00266 00267 00268 int HistogramDistribution::find_bin(real x) const 00269 { 00270 int b= 0, e= bin_positions.length()-2, p= b+(e-b)/2; 00271 00272 if(x < bin_positions[b] || x >= bin_positions[e+1]) 00273 return -1; 00274 00275 while(b < e) 00276 { 00277 if(bin_positions[p] == x) 00278 return p; 00279 if(bin_positions[p] > x) 00280 e= p-1; 00281 else 00282 b= p+1; 00283 p= b+(e-b)/2; 00284 } 00285 return p; 00286 } 00287 00288 void HistogramDistribution::calc_density_from_survival() 00289 { 00290 calc_density_from_survival(survival_values, bin_density, bin_positions); 00291 /* 00292 int n= bin_positions.length()-1; 00293 bin_density.resize(n); 00294 real sum= 0.0; 00295 for(int i= 0; i < n; ++i) 00296 if(bin_positions[i+1] != bin_positions[i]) 00297 if(i == n-1) 00298 sum+= (bin_density[i]= survival_values[i] / (bin_positions[i+1]-bin_positions[i])); 00299 else 00300 sum+= (bin_density[i]= (survival_values[i] - survival_values[i+1]) / (bin_positions[i+1]-bin_positions[i])); 00301 else 00302 bin_density[i]= 0.0; 00303 */ 00304 } 00305 00306 00307 void HistogramDistribution::calc_survival_from_density() 00308 { 00309 calc_survival_from_density(bin_density, survival_values, bin_positions); 00310 /* 00311 int n= bin_positions.length()-1; 00312 survival_values.resize(n); 00313 real prec= 0.0; 00314 for(int i= n-1; i >= 0; --i) 00315 prec= survival_values[i]= bin_density[i]*(bin_positions[i+1]-bin_positions[i]) + prec; 00316 for(int i= 0; i < n; ++i) 00317 survival_values[i]/= prec; 00318 */ 00319 } 00320 00321 void HistogramDistribution::calc_density_from_survival(const Vec& survival, Vec& density_, const Vec& positions) 00322 { 00323 int n= positions.length()-1; 00324 density_.resize(n); 00325 real sum= 0.0; 00326 for(int i= 0; i < n; ++i) 00327 if(positions[i+1] != positions[i]) 00328 if(i == n-1) 00329 sum+= (density_[i]= survival[i] / (positions[i+1]-positions[i])); 00330 else 00331 sum+= (density_[i]= (survival[i] - survival[i+1]) / (positions[i+1]-positions[i])); 00332 else 00333 density_[i]= 0.0; 00334 } 00335 00336 void HistogramDistribution::calc_survival_from_density(const Vec& density_, Vec& survival, const Vec& positions) 00337 { 00338 int n= positions.length()-1; 00339 survival.resize(n); 00340 real prec= 0.0; 00341 for(int i= n-1; i >= 0; --i) 00342 prec= survival[i]= density_[i]*(positions[i+1]-positions[i]) + prec; 00343 for(int i= 0; i < n; ++i) 00344 survival[i]/= prec; 00345 } 00346 00347 00348 } // end of namespace PLearn

Generated on Tue Aug 17 15:54:59 2004 for PLearn by doxygen 1.3.7