PLearn: LiftStatsCollector.cc Source File

00001 // -*- C++ -*- 00002 00003 // PLearn (A C++ Machine Learning Library) 00004 // Copyright (C) 1998 Pascal Vincent 00005 // Copyright (C) 1999-2002 Pascal Vincent, Yoshua Bengio and University of Montreal 00006 // Copyright (C) 2003 Olivier Delalleau 00007 // 00008 00009 // Redistribution and use in source and binary forms, with or without 00010 // modification, are permitted provided that the following conditions are met: 00011 // 00012 // 1. Redistributions of source code must retain the above copyright 00013 // notice, this list of conditions and the following disclaimer. 00014 // 00015 // 2. Redistributions in binary form must reproduce the above copyright 00016 // notice, this list of conditions and the following disclaimer in the 00017 // documentation and/or other materials provided with the distribution. 00018 // 00019 // 3. The name of the authors may not be used to endorse or promote 00020 // products derived from this software without specific prior written 00021 // permission. 00022 // 00023 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00024 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00025 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00026 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00028 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00029 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00030 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00031 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00032 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 // 00034 // This file is part of the PLearn library. For more information on the PLearn 00035 // library, go to the PLearn Web site at www.plearn.org 00036 00037 /* ******************************************************* 00038 * $Id: LiftStatsCollector.cc,v 1.12 2004/04/14 16:30:15 plearner Exp $ 00039 * This file is part of the PLearn library. 00040 ******************************************************* */ 00041 00044 #include "LiftStatsCollector.h" 00045 #include "TMat_maths.h" 00046 00047 namespace PLearn { 00048 using namespace std; 00049 00051 // LiftStatsCollector // 00053 LiftStatsCollector::LiftStatsCollector() 00054 : inherited(), 00055 count_fin(0), 00056 is_finalized(false), 00057 nstored(0), 00058 nsamples(0), 00059 npos(0), 00060 output_column_index(0), 00061 lift_fraction(0.1), 00062 opposite_lift(0), 00063 output_column(""), 00064 sign_trick(0), 00065 target_column(1), 00066 verbosity(0) 00067 { 00068 } 00069 00071 // Object stuff // 00073 PLEARN_IMPLEMENT_OBJECT( 00074 LiftStatsCollector, 00075 "Computes the performance of a binary classifier", 00076 "The following statistics can be requested out of getStat():\n" 00077 "- LIFT = % of positive examples in the first n samples, divided by the % of positive examples in the whole database\n" 00078 "- LIFT_MAX = best performance that could be achieved, if all positive examples were selected in the first n samples\n" 00079 "(where n = lift_fraction * nsamples).\n" 00080 "IMPORTANT: if you add more samples after you call finalize() (or get any of the statistics above), some samples may\n" 00081 "be wrongly discarded and further statistics may be wrong\n\n" 00082 "Here are the typical steps to follow to optimize the lift with a neural network:\n" 00083 "- add a lift_output cost to cost_funcs (e.g. cost_funcs = [ \"stable_cross_entropy\" \"lift_output\"];)\n" 00084 "- change the template_stats_collector of your PTester:\n" 00085 " template_stats_collector =\n" 00086 " LiftStatsCollector (\n" 00087 " output_column = \"lift_output\" ;\n" 00088 " opposite_lift = 1 ; # if you want to optimize the lift\n" 00089 " sign_trick = 1 ;\n" 00090 " )\n" 00091 "- add the lift to its statnames:\n" 00092 " statnames = [ \"E[train.E[stable_cross_entropy]]\",\"E[test.E[stable_cross_entropy]]\",\n" 00093 " \"E[train.LIFT]\", \"E[test.LIFT]\" ]\n" 00094 "- maybe also change which_cost in your HyperOptimize strategy.\n" 00095 00096 ); 00097 00098 void LiftStatsCollector::declareOptions(OptionList& ol) 00099 { 00100 00101 declareOption(ol, "count_fin", &LiftStatsCollector::count_fin, OptionBase::learntoption, 00102 " the number of times finalize() has been called since the last forget()"); 00103 00104 declareOption(ol, "lift_fraction", &LiftStatsCollector::lift_fraction, OptionBase::buildoption, 00105 " the % of samples to consider (default = 0.1)\n"); 00106 00107 declareOption(ol, "opposite_lift", &LiftStatsCollector::opposite_lift, OptionBase::buildoption, 00108 " if set to 1, the LIFT stat will return -LIFT, so that it can be considered as a cost (default = 0)\n"); 00109 00110 declareOption(ol, "output_column", &LiftStatsCollector::output_column, OptionBase::buildoption, 00111 " the name of the column in which is the output value (the default value, \"\", assumes it is the first column))\n"); 00112 00113 declareOption(ol, "sign_trick", &LiftStatsCollector::sign_trick, OptionBase::buildoption, 00114 " if set to 1, then you won't have to specify a target column: if the output is\n" 00115 " negative, the target will be assumed to be 0, and 1 otherwise - and in both cases\n" 00116 " we only consider the absolute value of the output\n" 00117 " (default = 0)\n" 00118 ); 00119 00120 declareOption(ol, "target_column", &LiftStatsCollector::target_column, OptionBase::buildoption, 00121 " the column in which is the target value (default = 1)\n"); 00122 00123 declareOption(ol, "verbosity", &LiftStatsCollector::verbosity, OptionBase::buildoption, 00124 " to be set >= 2 in order to display more info (default = 0)\n"); 00125 00126 // Now call the parent class' declareOptions 00127 inherited::declareOptions(ol); 00128 } 00129 00131 // build // 00133 void LiftStatsCollector::build() 00134 { 00135 inherited::build(); 00136 build_(); 00137 } 00138 00140 // build_ // 00142 void LiftStatsCollector::build_() 00143 { 00144 if (output_column != "") { 00145 int i = this->getFieldNum(output_column); 00146 if (i >= 0) { 00147 output_column_index = i; 00148 } else { 00149 // Not found. 00150 output_column_index = 0; 00151 } 00152 } else { 00153 output_column_index = 0; 00154 } 00155 } 00156 00158 // computeLift // 00160 real LiftStatsCollector::computeLift() { 00161 if (!is_finalized) 00162 finalize(); 00163 // Compute statistics. 00164 00165 int npos_in_n_first = (int) sum(n_first_updates.column(1)); 00166 real first_samples_perf = npos_in_n_first/ (real) n_samples_to_keep; 00167 real targets_perf = (npos_in_n_first + npos) / (real) nsamples; 00168 real lift = first_samples_perf/targets_perf*100.0; 00169 if (verbosity >= 10) { 00170 cout << "LiftStatsCollector : is_finalized=" << is_finalized << ", nstored=" 00171 << nstored << ", nsamples=" << nsamples << ", npos=" << npos 00172 << ", n_samples_to_keep=" << n_samples_to_keep << ", lift_fraction=" 00173 << lift_fraction << ", output_column=" << output_column << ", sign_trick=" 00174 << sign_trick << ", target_column=" << target_column << ", verbosity= " 00175 << verbosity << endl; 00176 } 00177 if (verbosity >= 2) { 00178 cout << "There is a total of " << npos_in_n_first + npos << 00179 " positive examples to discover." << endl; 00180 cout << "The learner found " << npos_in_n_first << 00181 " of them in the fraction considered (" << lift_fraction << ")." << endl; 00182 } 00183 if (opposite_lift == 1) { 00184 return -lift; 00185 } 00186 return lift; 00187 } 00188 00190 // computeLiftMax // 00192 real LiftStatsCollector::computeLiftMax() { 00193 if (!is_finalized) 00194 finalize(); 00195 int npos_in_n_first = (int) sum(n_first_updates.column(1)); 00196 real nones = npos_in_n_first + npos; 00197 real max_first_samples_perf = 00198 MIN(nones,(real)n_samples_to_keep) / (real) n_samples_to_keep; 00199 real targets_perf = (npos_in_n_first + npos) / (real) nsamples; 00200 real max_lift = max_first_samples_perf/targets_perf*100.0; 00201 return max_lift; 00202 } 00203 00205 // finalize // 00207 void LiftStatsCollector::finalize() 00208 { 00209 n_first_updates.resize(nstored,2); // get rid of the extra space allocated. 00210 00211 n_samples_to_keep = int(lift_fraction*nsamples); 00212 00213 if (nstored > n_samples_to_keep) { 00214 // If not, then no change has to be made to n_first_updates. 00215 00216 // Make sure the highest ouputs are in the last n_samples_to_keep elements 00217 // of n_first_updates. 00218 if (n_samples_to_keep > 0) { 00219 selectAndOrder(n_first_updates, nstored - n_samples_to_keep); 00220 } 00221 00222 // Count the number of positive examples in the lowest outputs. 00223 for (int i = 0; i < nstored - n_samples_to_keep; i++) { 00224 if (n_first_updates(i,1) == 1) { 00225 npos++; 00226 } 00227 } 00228 00229 // Clear the lowest outputs, that are now useless. 00230 for (int i = 0; i < n_samples_to_keep; i++) { 00231 n_first_updates(i,0) = n_first_updates(i + nstored - n_samples_to_keep, 0); 00232 n_first_updates(i,1) = n_first_updates(i + nstored - n_samples_to_keep, 1); 00233 } 00234 n_first_updates.resize(n_samples_to_keep, 2); 00235 nstored = n_samples_to_keep; 00236 } 00237 00238 inherited::finalize(); 00239 is_finalized = true; 00240 count_fin++; 00241 if (verbosity >= 10) { 00242 cout << "Called finalized " << count_fin << " times" << endl; 00243 } 00244 } 00245 00247 // forget // 00249 void LiftStatsCollector::forget() 00250 { 00251 is_finalized = false; 00252 nstored = 0; 00253 npos = 0; 00254 nsamples = 0; 00255 n_first_updates.resize(0,0); 00256 n_first_updates.resize(1000,2); 00257 inherited::forget(); 00258 count_fin = 0; 00259 } 00260 00262 // getStat // 00264 double LiftStatsCollector::getStat(const string& statspec) 00265 { 00266 PIStringStream str(statspec); 00267 string parsed; 00268 str.smartReadUntilNext("(",parsed); 00269 if (parsed == "LIFT") { 00270 return computeLift(); 00271 } 00272 else if (parsed == "LIFT_MAX") { 00273 return computeLiftMax(); 00274 } 00275 else 00276 return inherited::getStat(statspec); 00277 } 00278 00280 // makeDeepCopyFromShallowCopy // 00282 void LiftStatsCollector::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies) 00283 { 00284 inherited::makeDeepCopyFromShallowCopy(copies); 00285 deepCopyField(n_first_updates, copies); 00286 } 00287 00289 // update // 00291 void LiftStatsCollector::update(const Vec& x, real w) 00292 { 00293 if (count_fin > 0) { 00294 PLWARNING("In LiftStatsCollector::update - Called update after finalize (see help of LiftStatsCollector)"); 00295 } 00296 if (nstored == n_first_updates.length()) { 00297 n_first_updates.resize(MAX(1000,10*n_first_updates.length()), 2); 00298 } 00299 real output_val = x[output_column_index]; 00300 if (is_missing(output_val)) { 00301 // Missing value: we just discard it. 00302 is_finalized = false; 00303 inherited::update(x,w); 00304 return; 00305 } 00306 real target = -1; 00307 switch(sign_trick) { 00308 case 0: 00309 // Normal behavior. 00310 n_first_updates(nstored, 0) = output_val; 00311 target = x[target_column]; 00312 break; 00313 case 1: 00314 // Sign trick. 00315 n_first_updates(nstored, 0) = FABS(output_val); 00316 if (output_val <= 0) { 00317 x[output_column_index] = -output_val; 00318 target = 0; 00319 } else { 00320 target = 1; 00321 // cout << "Positive example : " << x << " (output_val = " << output_val << ")" << endl; 00322 } 00323 break; 00324 default: 00325 PLERROR("Wrong value for sign_trick in LiftStatsCollector"); 00326 break; 00327 } 00328 n_first_updates(nstored, 1) = target; 00329 if (target != 0 && target != 1) { 00330 PLERROR("In LiftStatsCollector::update - Target must be 0 or 1 !"); 00331 } 00332 nsamples++; 00333 nstored++; 00334 is_finalized = false; 00335 00336 inherited::update(x,w); 00337 } 00338 00339 } // end of namespace PLearn