00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00040
#include "HistogramDistribution.h"
00041
00042
00043
00044
namespace PLearn {
00045
using namespace std;
00046
00047 HistogramDistribution::HistogramDistribution() {}
00048
00049 HistogramDistribution::HistogramDistribution(
VMat data,
PP<Binner> binner_,
00050
PP<Smoother> smoother_)
00051 :bin_positions(data.length()+1), bin_density(data.length()), survival_values(data.length()),
00052 binner(binner_), smoother(smoother_)
00053 {
00054 setTrainingSet(data);
00055
train();
00056 }
00057
00058
// Registers the class together with its one-line and multi-line help texts.
// The help text had an unbalanced parenthesis and a missing sentence-ending
// period; both are corrected below.
PLEARN_IMPLEMENT_OBJECT(
    HistogramDistribution,
    "Represents and possibly learns (using a smoother) a univariate distribution as a histogram.",
    "This class represents a univariate distribution with a set of bins and their densities.\n"
    "The bins can be fixed or learned by a Binner object, and the densities\n"
    "can be learned from a training set. The empirical densities in the bins can also\n"
    "be smoothed with a Smoother (which is a general purpose univariate function\n"
    "smoothing mechanism). If the data is not univariate, then only the LAST column\n"
    "is considered. The smoother can either smooth the density or the survival fn.\n");
00066
00067 void HistogramDistribution::declareOptions(
OptionList& ol)
00068 {
00069
declareOption(ol,
"bin_positions", &HistogramDistribution::bin_positions, OptionBase::learntoption,
00070
"The n+1 positions that define n bins. There is one more bin position "
00071
"than number of bins, all the bins are supposed adjacent.");
00072
00073
declareOption(ol,
"bin_density", &HistogramDistribution::bin_density, OptionBase::learntoption,
00074
"Density of the distribution for each bin. The density is supposed "
00075
"constant within each bin:\n"
00076
"\t p(x) = bin_density[i] if bin_positions[i] < x <= bin_positions[i+1].");
00077
00078
declareOption(ol,
"survival_values", &HistogramDistribution::survival_values, OptionBase::learntoption,
00079
"Redundant with density is the pre-computed survival function.");
00080
00081
declareOption(ol,
"binner", &HistogramDistribution::binner, OptionBase::buildoption,
00082
"Used to do binning at training time (although a fixed binning scheme can be\n"
00083
"obtained by using a ManualBinner.B)");
00084
00085
declareOption(ol,
"smoother", &HistogramDistribution::smoother, OptionBase::buildoption,
00086
"Used to smooth learned density (or survival) at train time, after the empirical\n"
00087
"frequencies of each bin have been collected\n");
00088
00089
declareOption(ol,
"smooth_density_instead_of_survival_fn",
00090 &HistogramDistribution::smooth_density_instead_of_survival_fn, OptionBase::buildoption,
00091
"whether to smooth the density or the survival function, with the smoother\n");
00092
00093
00094 inherited::declareOptions(ol);
00095 }
00096
00097 void HistogramDistribution::build_()
00098 {
00099 }
00100
00101
00102 void HistogramDistribution::build()
00103 {
00104 inherited::build();
00105
build_();
00106 }
00107
00108 void HistogramDistribution::train()
00109 {
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
if(train_set->
width() !=
inputsize()+
targetsize())
00127
PLERROR(
"In HistogramDistribution::train(VMat training_set) training_set->width() != inputsize()+targetsize()");
00128
if(train_set->
width() != 1)
00129
PLERROR(
"In HistogramDistribution::train() train_set->width() must be 1 (column vec.)");
00130
if(
binner == 0)
00131
PLERROR(
"In HistogramDistribution::train() Can't train without a Binner.");
00132
00133
Vec data(train_set.
length());
00134 data << train_set.
getColumn(train_set.
width()-1);
00135
00136
PP<RealMapping> binning=
binner->getBinning(train_set);
00137 binning->setMappingForOther(0.0);
00138 binning->transform(data);
00139
00140
bin_positions= binning->getCutPoints();
00141
bin_density.
resize(
bin_positions.
length()-1);
00142
survival_values.
resize(
bin_positions.
length()-1);
00143
00144
for(
int i= 0; i < data.
length(); ++i)
00145 ++
survival_values[static_cast<int>(data[i])];
00146
for(
int i=
survival_values.
length()-2; i >= 0; --i)
00147
survival_values[i]+=
survival_values[i+1];
00148
for(
int i= survival_values.length()-1; i >= 0; --i)
00149 survival_values[i]/= survival_values[0];
00150
00151
if(
smoother)
00152 {
00153
if (
smooth_density_instead_of_survival_fn)
00154 {
00155
calc_density_from_survival();
00156
Vec df(
bin_density.
length());
00157 df <<
bin_density;
00158
smoother->smooth(df, bin_density,
bin_positions,
bin_positions);
00159
calc_survival_from_density();
00160 }
00161
else
00162 {
00163
Vec sv(survival_values.length());
00164 sv << survival_values;
00165
smoother->smooth(sv, survival_values,
bin_positions,
bin_positions);
00166
calc_density_from_survival();
00167 }
00168 }
00169
else
00170
calc_density_from_survival();
00171 }
00172
00173 void HistogramDistribution::computeOutput(
const Vec& input,
Vec& output)
00174 {
00175
if(input.
size() != 1 || output.
size() != 1)
00176
PLERROR(
"In HistogramDistribution::use implemented only for reals; i.e. input.size()=output.size()=1. "
00177
"Got input.size()=%d and output.size()=%d", input.
size(), output.
size());
00178
00179
if(outputs_def ==
"l") output[0]=
log_density(input);
00180
else if(outputs_def ==
"d") output[0]=
density(input);
00181
else if(outputs_def ==
"c") output[0]=
cdf(input);
00182
else if(outputs_def ==
"s") output[0]=
survival_fn(input);
00183
else if(outputs_def ==
"e") {
Vec mu(1);
expectation(mu); output[0]= mu[0]; }
00184
else if(outputs_def ==
"v") {
Mat m(1,1);
variance(m); output[0]= m(0,0); }
00185
else PLERROR(
"In HistogramDistribution::use unknown value for outputs_def= \"%s\"", outputs_def.c_str());
00186 }
00187
00188 void HistogramDistribution::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies)
00189 {
00190 PLearner::makeDeepCopyFromShallowCopy(copies);
00191
00192
deepCopyField(
bin_positions, copies);
00193
deepCopyField(
bin_density, copies);
00194
deepCopyField(
survival_values, copies);
00195
deepCopyField(
binner, copies);
00196
deepCopyField(
smoother, copies);
00197 }
00198
00199 double HistogramDistribution::log_density(
const Vec& x)
const
00200
{
00201
return log(
density(
x));
00202 }
00203
00204
00205 double HistogramDistribution::density(
const Vec& x)
const
00206
{
00207
if(
x.size() != 1)
00208
PLERROR(
"HistogramDistribution::density implemented only for univariate data (vec size == 1).");
00209
return bin_density[
find_bin(
x[0])];
00210 }
00211
00212
00213 double HistogramDistribution::survival_fn(
const Vec& x)
const
00214
{
00215
if(
x.size() != 1)
00216
PLERROR(
"HistogramDistribution::survival_fn implemented only for univariate data (vec size == 1).");
00217
int bin=
find_bin(
x[0]);
00218
if(bin < 0)
00219
if(
x[0] <
bin_positions[0])
00220
return 1.0;
00221
else
00222
return 0.0;
00223
00224
if(
x[0] < bin_positions[bin] && bin >= 1)
00225
return survival_values[bin-1] + (
x[0] - bin_positions[bin-1]) *
00226 (
survival_values[bin] -
survival_values[bin-1]) / (bin_positions[bin] - bin_positions[bin-1]);
00227
00228
return survival_values[bin];
00229 }
00230
00231 double HistogramDistribution::cdf(
const Vec& x)
const
00232
{
00233
return 1.0-
survival_fn(
x);
00234 }
00235
00236 void HistogramDistribution::expectation(
Vec& mu)
const
00237
{
00238
if(mu.
size() != 1)
00239
PLERROR(
"HistogramDistribution::expectation implemented only for univariate data (vec size == 1).");
00240
real sum= 0.0;
00241
for(
int i= 0; i <
bin_density.
size(); ++i)
00242
sum+=
bin_density[i] * (
bin_positions[i+1]-bin_positions[i]) * (bin_positions[i]+bin_positions[i+1])/2;
00243
00244 mu[0]=
sum;
00245 }
00246
00247 void HistogramDistribution::variance(
Mat& cov)
const
00248
{
00249
if(cov.
size() != 1)
00250
PLERROR(
"HistogramDistribution::variance implemented only for univariate data");
00251
real sumsq= 0.0,
sum= 0.0, s;
00252
int n=
bin_density.
size();
00253
for(
int i= 0; i < n; ++i)
00254 {
00255 s=
bin_density[i] * (
bin_positions[i+1]-bin_positions[i]) * (bin_positions[i]+bin_positions[i+1])/2;
00256
sum+= s;
00257 sumsq+= s*s;
00258 }
00259 cov(0,0) =
abs(sumsq-(
sum*
sum)/n)/n;
00260 }
00261
00262 double HistogramDistribution::prob_in_range(
const Vec& x0,
const Vec& x1)
const
00263
{
00264
return survival_fn(x0) -
survival_fn(x1);
00265 }
00266
00267
00268 int HistogramDistribution::find_bin(
real x)
const
00269
{
00270
int b= 0, e=
bin_positions.
length()-2, p= b+(e-b)/2;
00271
00272
if(x < bin_positions[b] || x >=
bin_positions[e+1])
00273
return -1;
00274
00275
while(b < e)
00276 {
00277
if(bin_positions[p] ==
x)
00278
return p;
00279
if(bin_positions[p] >
x)
00280 e= p-1;
00281
else
00282 b= p+1;
00283 p= b+(e-b)/2;
00284 }
00285
return p;
00286 }
00287
00288 void HistogramDistribution::calc_density_from_survival()
00289 {
00290
calc_density_from_survival(
survival_values,
bin_density,
bin_positions);
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304 }
00305
00306
00307 void HistogramDistribution::calc_survival_from_density()
00308 {
00309
calc_survival_from_density(
bin_density,
survival_values,
bin_positions);
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319 }
00320
00321 void HistogramDistribution::calc_density_from_survival(
const Vec& survival,
Vec& density_,
const Vec& positions)
00322 {
00323
int n= positions.
length()-1;
00324 density_.
resize(n);
00325
real sum= 0.0;
00326
for(
int i= 0; i < n; ++i)
00327
if(positions[i+1] != positions[i])
00328
if(i == n-1)
00329
sum+= (density_[i]= survival[i] / (positions[i+1]-positions[i]));
00330
else
00331
sum+= (density_[i]= (survival[i] - survival[i+1]) / (positions[i+1]-positions[i]));
00332
else
00333 density_[i]= 0.0;
00334 }
00335
00336 void HistogramDistribution::calc_survival_from_density(
const Vec& density_,
Vec& survival,
const Vec& positions)
00337 {
00338
int n= positions.
length()-1;
00339 survival.
resize(n);
00340
real prec= 0.0;
00341
for(
int i= n-1; i >= 0; --i)
00342 prec= survival[i]= density_[i]*(positions[i+1]-positions[i]) + prec;
00343
for(
int i= 0; i < n; ++i)
00344 survival[i]/= prec;
00345 }
00346
00347
00348 }