00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00044
#include "LiftStatsCollector.h"
00045
#include "TMat_maths.h"
00046
00047
namespace PLearn {
00048
using namespace std;
00049
00051
00053 LiftStatsCollector::LiftStatsCollector()
00054 :
inherited(),
00055 count_fin(0),
00056 is_finalized(false),
00057 nstored(0),
00058 nsamples(0),
00059 npos(0),
00060 output_column_index(0),
00061 lift_fraction(0.1),
00062 opposite_lift(0),
00063 output_column(""),
00064 sign_trick(0),
00065 target_column(1),
00066 verbosity(0)
00067 {
00068 }
00069
00071
00073
// Registers LiftStatsCollector with the PLearn object system, together with
// its one-line description and its multi-line help text.
PLEARN_IMPLEMENT_OBJECT(
    LiftStatsCollector,
    "Computes the performance of a binary classifier",
    "The following statistics can be requested out of getStat():\n"
    "- LIFT = % of positive examples in the first n samples, divided by the % of positive examples in the whole database\n"
    "- LIFT_MAX = best performance that could be achieved, if all positive examples were selected in the first n samples\n"
    "(where n = lift_fraction * nsamples).\n"
    "IMPORTANT: if you add more samples after you call finalize() (or get any of the statistics above), some samples may\n"
    "be wrongly discarded and further statistics may be wrong\n\n"
    "Here are the typical steps to follow to optimize the lift with a neural network:\n"
    "- add a lift_output cost to cost_funcs (e.g. cost_funcs = [ \"stable_cross_entropy\" \"lift_output\"];)\n"
    "- change the template_stats_collector of your PTester:\n"
    " template_stats_collector =\n"
    " LiftStatsCollector (\n"
    " output_column = \"lift_output\" ;\n"
    " opposite_lift = 1 ; # if you want to optimize the lift\n"
    " sign_trick = 1 ;\n"
    " )\n"
    "- add the lift to its statnames:\n"
    " statnames = [ \"E[train.E[stable_cross_entropy]]\",\"E[test.E[stable_cross_entropy]]\",\n"
    " \"E[train.LIFT]\", \"E[test.LIFT]\" ]\n"
    "- maybe also change which_cost in your HyperOptimize strategy.\n"
);
00097
00098 void LiftStatsCollector::declareOptions(
OptionList& ol)
00099 {
00100
00101
declareOption(ol,
"count_fin", &LiftStatsCollector::count_fin, OptionBase::learntoption,
00102
" the number of times finalize() has been called since the last forget()");
00103
00104
declareOption(ol,
"lift_fraction", &LiftStatsCollector::lift_fraction, OptionBase::buildoption,
00105
" the % of samples to consider (default = 0.1)\n");
00106
00107
declareOption(ol,
"opposite_lift", &LiftStatsCollector::opposite_lift, OptionBase::buildoption,
00108
" if set to 1, the LIFT stat will return -LIFT, so that it can be considered as a cost (default = 0)\n");
00109
00110
declareOption(ol,
"output_column", &LiftStatsCollector::output_column, OptionBase::buildoption,
00111
" the name of the column in which is the output value (the default value, \"\", assumes it is the first column))\n");
00112
00113
declareOption(ol,
"sign_trick", &LiftStatsCollector::sign_trick, OptionBase::buildoption,
00114
" if set to 1, then you won't have to specify a target column: if the output is\n"
00115
" negative, the target will be assumed to be 0, and 1 otherwise - and in both cases\n"
00116
" we only consider the absolute value of the output\n"
00117
" (default = 0)\n"
00118 );
00119
00120
declareOption(ol,
"target_column", &LiftStatsCollector::target_column, OptionBase::buildoption,
00121
" the column in which is the target value (default = 1)\n");
00122
00123
declareOption(ol,
"verbosity", &LiftStatsCollector::verbosity, OptionBase::buildoption,
00124
" to be set >= 2 in order to display more info (default = 0)\n");
00125
00126
00127 inherited::declareOptions(ol);
00128 }
00129
00131
00133 void LiftStatsCollector::build()
00134 {
00135 inherited::build();
00136
build_();
00137 }
00138
00140
00142 void LiftStatsCollector::build_()
00143 {
00144
if (
output_column !=
"") {
00145
int i = this->getFieldNum(
output_column);
00146
if (i >= 0) {
00147
output_column_index = i;
00148 }
else {
00149
00150
output_column_index = 0;
00151 }
00152 }
else {
00153
output_column_index = 0;
00154 }
00155 }
00156
00158
00160 real LiftStatsCollector::computeLift() {
00161
if (!
is_finalized)
00162
finalize();
00163
00164
00165
int npos_in_n_first = (
int)
sum(
n_first_updates.
column(1));
00166
real first_samples_perf = npos_in_n_first/ (
real)
n_samples_to_keep;
00167
real targets_perf = (npos_in_n_first +
npos) / (
real)
nsamples;
00168
real lift = first_samples_perf/targets_perf*100.0;
00169
if (
verbosity >= 10) {
00170 cout <<
"LiftStatsCollector : is_finalized=" <<
is_finalized <<
", nstored="
00171 <<
nstored <<
", nsamples=" <<
nsamples <<
", npos=" <<
npos
00172 <<
", n_samples_to_keep=" <<
n_samples_to_keep <<
", lift_fraction="
00173 <<
lift_fraction <<
", output_column=" <<
output_column <<
", sign_trick="
00174 <<
sign_trick <<
", target_column=" <<
target_column <<
", verbosity= "
00175 <<
verbosity <<
endl;
00176 }
00177
if (
verbosity >= 2) {
00178 cout <<
"There is a total of " << npos_in_n_first +
npos <<
00179
" positive examples to discover." <<
endl;
00180 cout <<
"The learner found " << npos_in_n_first <<
00181
" of them in the fraction considered (" <<
lift_fraction <<
")." <<
endl;
00182 }
00183
if (
opposite_lift == 1) {
00184
return -lift;
00185 }
00186
return lift;
00187 }
00188
00190
00192 real LiftStatsCollector::computeLiftMax() {
00193
if (!
is_finalized)
00194
finalize();
00195
int npos_in_n_first = (
int)
sum(
n_first_updates.
column(1));
00196
real nones = npos_in_n_first +
npos;
00197
real max_first_samples_perf =
00198
MIN(nones,(
real)
n_samples_to_keep) / (
real) n_samples_to_keep;
00199
real targets_perf = (npos_in_n_first + npos) / (
real)
nsamples;
00200
real max_lift = max_first_samples_perf/targets_perf*100.0;
00201
return max_lift;
00202 }
00203
00205
00207 void LiftStatsCollector::finalize()
00208 {
00209
n_first_updates.
resize(
nstored,2);
00210
00211
n_samples_to_keep =
int(
lift_fraction*
nsamples);
00212
00213
if (
nstored >
n_samples_to_keep) {
00214
00215
00216
00217
00218
if (
n_samples_to_keep > 0) {
00219
selectAndOrder(
n_first_updates,
nstored -
n_samples_to_keep);
00220 }
00221
00222
00223
for (
int i = 0; i <
nstored -
n_samples_to_keep; i++) {
00224
if (
n_first_updates(i,1) == 1) {
00225
npos++;
00226 }
00227 }
00228
00229
00230
for (
int i = 0; i < n_samples_to_keep; i++) {
00231
n_first_updates(i,0) =
n_first_updates(i +
nstored - n_samples_to_keep, 0);
00232
n_first_updates(i,1) =
n_first_updates(i +
nstored - n_samples_to_keep, 1);
00233 }
00234
n_first_updates.
resize(n_samples_to_keep, 2);
00235
nstored = n_samples_to_keep;
00236 }
00237
00238 inherited::finalize();
00239
is_finalized =
true;
00240
count_fin++;
00241
if (
verbosity >= 10) {
00242 cout <<
"Called finalized " <<
count_fin <<
" times" <<
endl;
00243 }
00244 }
00245
00247
00249 void LiftStatsCollector::forget()
00250 {
00251
is_finalized =
false;
00252
nstored = 0;
00253
npos = 0;
00254
nsamples = 0;
00255
n_first_updates.
resize(0,0);
00256
n_first_updates.
resize(1000,2);
00257 inherited::forget();
00258
count_fin = 0;
00259 }
00260
00262
00264 double LiftStatsCollector::getStat(
const string& statspec)
00265 {
00266
PIStringStream str(statspec);
00267
string parsed;
00268 str.
smartReadUntilNext(
"(",parsed);
00269
if (parsed ==
"LIFT") {
00270
return computeLift();
00271 }
00272
else if (parsed ==
"LIFT_MAX") {
00273
return computeLiftMax();
00274 }
00275
else
00276
return inherited::getStat(statspec);
00277 }
00278
00280
00282 void LiftStatsCollector::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies)
00283 {
00284 inherited::makeDeepCopyFromShallowCopy(copies);
00285
deepCopyField(
n_first_updates, copies);
00286 }
00287
00289
00291 void LiftStatsCollector::update(
const Vec& x,
real w)
00292 {
00293
if (
count_fin > 0) {
00294
PLWARNING(
"In LiftStatsCollector::update - Called update after finalize (see help of LiftStatsCollector)");
00295 }
00296
if (
nstored ==
n_first_updates.
length()) {
00297
n_first_updates.
resize(
MAX(1000,10*
n_first_updates.
length()), 2);
00298 }
00299
real output_val =
x[
output_column_index];
00300
if (
is_missing(output_val)) {
00301
00302
is_finalized =
false;
00303 inherited::update(
x,w);
00304
return;
00305 }
00306
real target = -1;
00307
switch(
sign_trick) {
00308
case 0:
00309
00310
n_first_updates(
nstored, 0) = output_val;
00311 target =
x[
target_column];
00312
break;
00313
case 1:
00314
00315
n_first_updates(
nstored, 0) =
FABS(output_val);
00316
if (output_val <= 0) {
00317
x[
output_column_index] = -output_val;
00318 target = 0;
00319 }
else {
00320 target = 1;
00321
00322 }
00323
break;
00324
default:
00325
PLERROR(
"Wrong value for sign_trick in LiftStatsCollector");
00326
break;
00327 }
00328
n_first_updates(
nstored, 1) = target;
00329
if (target != 0 && target != 1) {
00330
PLERROR(
"In LiftStatsCollector::update - Target must be 0 or 1 !");
00331 }
00332
nsamples++;
00333
nstored++;
00334
is_finalized =
false;
00335
00336 inherited::update(
x,w);
00337 }
00338
00339 }