
FieldConvertCommand.cc

// -*- C++ -*-

// Copyright (C) 2004 Université de Montréal
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//  1. Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//
//  2. Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//
//  3. The name of the authors may not be used to endorse or promote
//     products derived from this software without specific prior written
//     permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
// NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This file is part of the PLearn library. For more information on the PLearn
// library, go to the PLearn Web site at www.plearn.org

/* *******************************************************
 * $Id: FieldConvertCommand.cc,v 1.34 2004/08/03 16:15:26 tihocan Exp $
 ******************************************************* */

#include "FieldConvertCommand.h"
#include <plearn/db/getDataSet.h>
#include <plearn/math/pl_erf.h>
#include <plearn/math/random.h>
#include <plearn/base/stringutils.h>
#include <plearn/vmat/VMat.h>

// Bit flags combined into each field's 'action' mask.
#define NORMALIZE   1
#define MISSING_BIT 2
#define ONEHOT      4
#define SKIP        16
#define UNIFORMIZE  32

using namespace PLearn;

PLearnCommandRegistry FieldConvertCommand::reg_(new FieldConvertCommand);

/////////////////////////
// FieldConvertCommand //
/////////////////////////
FieldConvertCommand::FieldConvertCommand()
    : PLearnCommand("FieldConvert",

                    "Reads a dataset and generates a .vmat file based on the data, but optimized for training.\n",

                    "The nature of each field of the original dataset is automatically detected, and determines the appropriate treatment.\n"
                    "The possible field types, with the corresponding treatment, are:\n"
                    "continuous      - quantitative data (data is real): the field is replaced by the normalized data (minus mean, divided by stddev)\n"
                    "binary          - binary discrete data (processed as a continuous field)\n"
                    "discrete_uncorr - discrete integers (qualitative data, e.g. postal codes, categories) not correlated with the target: the field is replaced by a group of fields in a one-hot fashion\n"
                    "discrete_corr   - discrete integers, correlated with the target: both the normalized and the one-hot versions of the field are used in the new dataset\n"
                    "constant        - constant data: the field is skipped (it is not present in the new dataset)\n"
                    "skip            - irrelevant data: the field is skipped (it is not present in the new dataset)\n"
                    "\n"
                    "When there are ambiguities, messages are displayed for the problematic field(s) and they are skipped. The user must then use a 'force' file\n"
                    "to explicitly force the types of the ambiguous field(s). The file is made of lines in the following possible formats:\n"
                    "FIELDNAME=type\n"
                    "fieldNumberA-fieldNumberB=type   [e.g. 200-204=constant, to force a range]\n"
                    "FIELDNAME+=\"processing\" (n_inputs)   [to add a home-made processing after a field; the number of inputs thus added must be given]\n"
                    "\n"
                    "Note that for all types but skip, if the field contains missing values, an additional 'missing-bit' field is added that is '1' only for missing values.\n"
                    "The difference between types constant and skip is only cosmetic: constant means the field is constant, while skip means either there are too many missing values or it has been forced to skip.\n"
                    "A report file is generated that contains the information about the processing of each field.\n"
                    "The target field index of the source must be specified (i.e. to perform the correlation test). It can be any field of the "
                    "source dataset, but it will be the last field of the new dataset. *** We assume the target is never missing. ***\n\n"
                    "usage: FieldConvert\n"
                    "        *source = [source dataset]\n"
                    "        *destination = [new dataset with vmat extension]\n"
                    "        *target = [field index of target]\n"
                    "         force = [force file]\n"
                    "         report = [report file] (default = 'FieldConvertReport.txt')\n"
                    "         min_fraction = [if the number of unique values is > 'min_fraction' * the number of non-missing values -> the field is continuous]\n"
                    "                        (default = 0.3)\n"
                    "         max_pvalue = [maximum p-value to assume correlation with the target] (default = 0.025)\n"
                    "         frac_missing_to_skip = [if MISSING >= 'frac_missing_to_skip' * number of samples, then the field is skipped]\n"
                    "                                (default = 1.0)\n"
                    "         frac_enough = [if a field is discrete, only values represented by at least frac_enough * nSamples\n"
                    "                        elements will be kept] (default = 0.005)\n"
                    "         precompute = [none | pmat | ... : possibly add a <PRECOMPUTE> tag in the destination] (default = none)\n"
                    "         discrete_tolerance = [if a discrete field has float values, its one-hot mapping will be enlarged according to\n"
                    "                               this factor] (default = 0.001)\n"
                    "         uniformize = [0 | 1 | 2: whether fields should be uniformized, 2 meaning all fields and 1 meaning only\n"
                    "                       fields obviously not following a normal distribution] (default = 0)\n"
                    "\n"
                    "where arguments marked with an asterisk (*) are required\n"
                    )
{}
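// For illustration only: a hypothetical invocation and force file, following
// the syntax documented above. The field names, indices and the VPL snippet
// are made up, and the standard 'plearn' command-line driver is assumed:
//
//     plearn FieldConvert source=data.amat destination=data.vmat target=54 force=force.txt
//
//     # force.txt
//     postal_code=discrete_uncorr
//     200-204=constant
//     income+="@income log :log_income" (1)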
/////////
// run //
/////////
void FieldConvertCommand::run(const vector<string>& args)
{
    // Set default values.
    UNIQUE_NMISSING_FRACTION_TO_ASSUME_CONTINUOUS = 0.3;
    PVALUE_THRESHOLD = 0.025;
    FRAC_MISSING_TO_SKIP = 1.0;
    FRAC_ENOUGH = 0.005;
    DISCRETE_TOLERANCE = 1e-3;
    target = -1;
    report_fn = "FieldConvertReport.txt";
    precompute = "none";
    int uniformize = 0;

    for (int i = 0; i < (signed)args.size(); i++)
    {
        vector<string> val = split(args[i], "=");
        if (val.size() < 2)
            PLERROR("bad argument: %s ", args[i].c_str());
        if (val[0] == "source")
            source_fn = val[1];
        else if (val[0] == "destination")
            desti_fn = val[1];
        else if (val[0] == "target")
            target = toint(val[1]);
        else if (val[0] == "force")
            force_fn = val[1];
        else if (val[0] == "report")
            report_fn = val[1];
        else if (val[0] == "min_fraction")
            UNIQUE_NMISSING_FRACTION_TO_ASSUME_CONTINUOUS = toreal(val[1]);
        else if (val[0] == "max_pvalue")
            PVALUE_THRESHOLD = toreal(val[1]);
        else if (val[0] == "frac_missing_to_skip")
            FRAC_MISSING_TO_SKIP = toreal(val[1]);
        else if (val[0] == "discrete_tolerance")
            DISCRETE_TOLERANCE = toreal(val[1]);
        else if (val[0] == "uniformize")
            uniformize = toint(val[1]);
        else if (val[0] == "frac_enough")
            FRAC_ENOUGH = toreal(val[1]);
        else if (val[0] == "precompute")
            precompute = val[1];
        else
            PLERROR("unknown argument: %s ", val[0].c_str());
    }
    if (source_fn == "")
        PLERROR("you must specify the source file");
    if (desti_fn == "")
        PLERROR("you must specify the destination .vmat");
    if (target == -1)
        PLERROR("you must specify the target field index");

    // Manually forced types, indexed by field number.
    map<int, FieldType> force;
    map<int, string> additional_proc;
    map<int, int> additional_proc_size;

    real beta_hat, student = -1;
    real correlation = -1;

    VMat vm = getDataSet(source_fn);

    // A vector where we store the indices of the fields to be uniformized.
    TVec<int> need_to_be_uniformized;

    if (target < 0 || target >= vm->width()) {
        PLERROR("The target column you specified is not valid");
    }

    // Compute the resulting inputsize as the preprocessing goes on.
    int inputsize = 0;

    cout << "### using field " << target << " as target" << endl;

    // Read the user's custom operations from the force file.
    vector<string> forcelines;
    if (force_fn != "")
        forcelines = getNonBlankLines(loadFileAsString(force_fn));
    for (int i = 0; i < (signed)forcelines.size(); i++)
    {
        size_t pos_of_equal = forcelines[i].find('=');
        if (pos_of_equal == string::npos)
            PLERROR("In FieldConvertCommand - A line in the force file does not contain the '=' character");
        vector<string> vec(2);
        vec[0] = forcelines[i].substr(0, pos_of_equal);
        vec[1] = forcelines[i].substr(pos_of_equal + 1);
        vector<string> leftpart = split(vec[0], "-");
        if (leftpart.size() == 1 && leftpart[0].substr(leftpart[0].size() - 1) == "+") {
            // Syntax: field+="processing" (number of inputs added)
            int field_index = vm->fieldIndex(leftpart[0].substr(0, leftpart[0].size() - 1));
            if (field_index == -1)
                PLERROR("In FieldConvertCommand - A field was not found in the source VMatrix");
            if (additional_proc[field_index] != "")
                PLERROR("In FieldConvertCommand - There can be only one additional processing specified for each field");
            size_t last_open_par = vec[1].rfind('(');
            if (last_open_par == string::npos)
                PLERROR("In FieldConvertCommand - You must specify the number of inputs added in a processing");
            string added_inputs = vec[1].substr(last_open_par + 1, vec[1].rfind(')') - last_open_par - 1);
            additional_proc_size[field_index] = toint(added_inputs);
            size_t first_quote = vec[1].find('"');
            size_t last_quote = vec[1].rfind('"', last_open_par);
            additional_proc[field_index] = vec[1].substr(first_quote + 1, last_quote - first_quote - 1);
        } else {
            FieldType rpart = stringToFieldType(vec[1]);

            if (leftpart.size() > 1)
            {
                // We have a range.
                int a = toint(leftpart[0]);
                int b = toint(leftpart[1]);
                for (int j = a; j <= b; j++)
                    force[j] = rpart;
            }
            else
            {
                if (vm->fieldIndex(vec[0]) == -1)
                    cout << "field: " << vec[0] << " doesn't exist in matrix" << endl;
                force[vm->fieldIndex(vec[0])] = rpart;
            }
        }
    }

    TVec<StatsCollector> sc;
    sc = vm->getStats();

    ofstream* out;
    ofstream* out_uni = 0;
    string filename_non_uni = desti_fn + ".non_uniformized.vmat";
    if (uniformize > 0) {
        // We write two files: one with the preprocessing, and another one
        // with the uniformization.
        out = new ofstream(filename_non_uni.c_str());
        out_uni = new ofstream(desti_fn.c_str());
    } else {
        out = new ofstream(desti_fn.c_str());
    }
    ofstream report(report_fn.c_str());
    *out << "<SOURCES>\n" + source_fn + "\n</SOURCES>\n<PROCESSING>\n";

    // Minimum number of representatives of a class to be considered significant.
    int n_enough = (int) (FRAC_ENOUGH * vm->length());

    ProgressBar* pb = new ProgressBar("Analyzing fields", vm->width());

    // Process each field.
    for (int i = 0; i < vm->width(); i++)
    {
        type = unknown; // At the beginning we don't know the type.
        beta_hat = 0;
        string message;
        int action = 0;
        int count = (int)sc[i].getCounts()->size() - 1; // Number of unique values.

        // Is this field's type forced?
        if (force.find(i) != force.end())
            type = force[i];
        else if (i == target)
            // Add the target ONLY at the end of the process
            // (so that it is the last column of the dataset).
            type = skip;

        // Test for fields to be skipped, when not enough data is available.
        if (sc[i].nnonmissing() <= (1 - FRAC_MISSING_TO_SKIP) * vm->length()) {
            if (type != unknown && type != skip && type != constant) {
                // We forced the type to something that should not be skipped.
                cout << "Warning: you forced the type of field number " << i << ", "
                     << "but there are too many missing values, so it will be skipped. "
                     << "If you want to keep it, you will have to add it by hand to the resulting .vmat"
                     << endl;
            }
            type = skip;
        }

        // Test whether there are only 2 unique values: in this case we don't
        // need a one-hot, and we set the type to binary (which is processed
        // the same way as continuous).
        if (count == 2 && type != skip) {
            Vec counts(2);
            int k = 0;
            for (map<real,StatsCollectorCounts>::iterator it = sc[i].getCounts()->begin(); k <= 1; ++it) {
                counts[k++] = it->second.n;
            }
            if (counts[0] >= n_enough && counts[1] >= n_enough) {
                if (type != unknown && type != binary) {
                    cout << "Warning: type for field number " << i << " set to binary, "
                         << "but you had forced it to something else." << endl;
                }
                type = binary;
            } else {
                // Not enough representatives for one of the classes.
                if (type != unknown && type != skip) {
                    cout << "Warning: field number " << i << " is binary but doesn't have "
                         << "enough representatives of each class, thus it will be skipped, "
                         << "even if you had forced it to some other type (edit the resulting "
                         << ".vmat if you really want to add it)." << endl;
                }
                type = skip;
            }
        }

        // Test for constant values.
        if (count <= 1 && type != skip && type != constant) {
            if (sc[i].nmissing() > 0 && sc[i].nmissing() < vm->length()) {
                // This case actually never occurs in the Bell database.
                // That's why we leave it to the user.
                message = "Constant field, but there are " + tostring(sc[i].nmissing()) +
                    " missing values. Force the type, or modify this program!";
            }
            else {
                // Either there is no missing value, or they are all missing.
                if (type != unknown) {
                    cout << "Warning: field number " << i << " has been forced, but "
                         << "appears to be constant. Edit the resulting .vmat if you "
                         << "really want to add it." << endl;
                }
                type = constant;
            }
        }

        // Test whether there exist fractional parts.
        // This test has two goals:
        //  - if we don't know the type, a fractional part indicates continuous data;
        //  - if the type is discrete, we need to be careful with the one-hot ranges,
        //    because taking exact float values is not a good idea.
        bool may_be_fraction = false;
        if (type == continuous || type == binary) {
            may_be_fraction = true;
        } else if (type != skip && type != constant) {
            int k = 0;
            for (map<real,StatsCollectorCounts>::iterator it = sc[i].getCounts()->begin(); k < count; ++it) {
                real val = it->first;
                k++;
                if ((val - (int)val) != 0)
                {
                    may_be_fraction = true;
                    break;
                }
            }
        }

        // Did we find the type already?
        if (type == unknown && message == "")
        {
            if (sc[i].max() > -1000 && vm->getStringToRealMapping(i).size() > 0)
                message = "Field uses both a string map and numerical values";
            else if (sc[i].min() >= 0 && sc[i].max() >= 12000 && sc[i].max() <= 20000) {
                // Could be a numeric SAS date.
                // We first make sure they are all integer values.
                bool non_integer = false;
                for (int j = 0; j < vm->length(); j++)
                {
                    real val = vm->get(j, i);
                    if (!is_missing(val) && ((val - (int)val) > 0))
                        non_integer = true;
                }
                if (!non_integer) {
                    message = "Looks like a numeric SAS date. If this is the case, first edit the source (.vmat) file to change the 'TextFilesVMatrix' field type (use sas_date), then edit force.txt to force the type to continuous. If it's not a date, please use force.txt to force the type.";
                }
            }
            else if (sc[i].min() > 19700000 && sc[i].max() < 20080000)
                // Could be a date between 1970 and 2008.
                message = "Looks like a date. Edit the source file to change the 'TextFilesVMatrix' field type (use jdate). Otherwise, edit force.txt to force the type.";

            // Test whether there are enough unique values to assume continuous
            // data (having a string map implies discrete data).
            else if ((count >= MIN(UNIQUE_NMISSING_FRACTION_TO_ASSUME_CONTINUOUS * sc[i].nnonmissing(), 2000))
                     && vm->getStringToRealMapping(i).size() == 0)
                type = continuous;
            else {
                // If there are fractional parts, assume continuous.
                if (may_be_fraction) {
                    type = continuous;
                }
            }

            // If the data doesn't look continuous (small number of unique
            // values and no fractional parts), 'type' still equals unknown.
            if (type == unknown && message == "")
            {
                // Perform the correlation test.
                real sigma_hat = 0, sigma_beta_hat = 0;
                real xmean = sc[i].mean();
                real ymean = sc[target].mean();
                real x_minus_xmean_square = 0;
                real y_minus_ymean_square = 0;

                int len_nm = 0;
                int len = vm->length();

                Vec x(len);
                Vec y(len);
                vm->getColumn(i, x);
                vm->getColumn(target, y);

                // Compute beta-hat.
                for (int j = 0; j < len; j++)
                    if (!is_missing(x[j]) && !is_missing(y[j]))
                    {
                        real xdiff = x[j] - xmean;
                        real ydiff = y[j] - ymean;
                        beta_hat += xdiff * ydiff;
                        x_minus_xmean_square += xdiff * xdiff;
                        y_minus_ymean_square += ydiff * ydiff;
                        len_nm++;
                    }

                // Correlation^2 = sum_xy^2 / (sum_xx * sum_yy).
                correlation = fabs(beta_hat) / sqrt(x_minus_xmean_square * y_minus_ymean_square);

                beta_hat /= x_minus_xmean_square;

                // Compute sigma-hat.
                for (int j = 0; j < len; j++)
                    if (!is_missing(x[j]) && !is_missing(y[j]))
                        sigma_hat += square(y[j] - ymean - beta_hat * (x[j] - xmean));
                sigma_hat /= len_nm - 2;

                sigma_beta_hat = sigma_hat / x_minus_xmean_square;
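                // This is the standard t-test on the slope of the simple
                // linear regression of the target y on the field x: under the
                // null hypothesis beta = 0, t = beta_hat / sqrt(sigma_beta_hat)
                // follows a Student distribution with (len_nm - 2) degrees of
                // freedom, so the two-tailed p-value is 2 * P(T <= -|t|).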
                real t = beta_hat / sqrt(sigma_beta_hat);

                student = 2 * student_t_cdf(-fabs(t), len_nm - 2);
                if (student < PVALUE_THRESHOLD)
                {
                    // Then assume the data is discrete but correlated.
                    type = discrete_corr;
                }
            }

            // If we're still not sure (that is, type == unknown and message == "").
            if (type == unknown && message == "")
            {
                // Is the data 'uncorrelated + discrete + sparse'?
                if ((real)(sc[i].max() - sc[i].min() + 1) > (real)(count) * 2) {
                    type = continuous;
                }
                else if ((real)(sc[i].max() - sc[i].min() + 1) != (real)(count))
                    message = "(edit force.txt): Data is made of a semi-sparse (density < 50%) distribution of integers (uncorrelated with the target). max: " + tostring(sc[i].max()) + " min: " + tostring(sc[i].min()) + " count: " + tostring(count);
                else {
                    // The data is discrete, not sparse, and not correlated with
                    // the target, so simply make it a one-hot.
                    type = discrete_uncorr;
                }
            }
        }

        // Now find out which actions to perform according to the type.

        // We treat 'binary' as 'continuous'.
        if (type == binary)
            type = continuous;

        if (type == unknown)
            cout << tostring(i) + " (" + vm->fieldName(i) + ") " << message << endl;
        else if (type == continuous)
        {
            action |= NORMALIZE;
            if (sc[i].nmissing() > 0)
                action |= MISSING_BIT;
        }
        else if (type == discrete_uncorr)
        {
            action = ONEHOT;
            if (sc[i].nmissing() > 0)
                action |= MISSING_BIT;
        }
        else if (type == skip || type == constant)
        {
            action = SKIP;
        }
        else if (type == discrete_corr)
        {
            action |= NORMALIZE;
            action |= ONEHOT;
            if (sc[i].nmissing() > 0)
                action |= MISSING_BIT;
        }

        // Perform the actions.

        if (action & NORMALIZE)
        {
            *out << "@" << vm->fieldName(i) << " ";
            // Replace NaNs by either the most frequent value or the mean.
            if (sc[i].nmissing() > 0)
            {
                // Find the 'mode' of the distribution, if any.
                double maxi = -1;
                real missingval = -1;
                for (map<real,StatsCollectorCounts>::iterator it = sc[i].getCounts()->begin(); it != sc[i].getCounts()->end(); ++it)
                    if (it->second.n > maxi)
                    {
                        maxi = it->second.n;
                        missingval = it->first;
                    }
                if (maxi < 10)
                    // The most frequent value appears less than 10 times:
                    // a missing value is replaced by the mean.
                    missingval = sc[i].mean();
                // Otherwise, a missing value is replaced by the most frequent value.

                *out << "isnan " << missingval << " @" << vm->fieldName(i) << " ifelse ";
            }

            // Uniformize all fields when 'uniformize' is set to 2.
            bool to_uniformize = (uniformize == 2);
            // If this field violates the normality assumption, and the user set
            // the 'uniformize' option to 1, then we should keep this field
            // intact and remember that it will need to be uniformized in the
            // final vmat.
            bool apply_normalization = true;
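            // Heuristic normality check below: if the n non-missing values
            // were i.i.d. N(mu, sigma^2), the probability that all of them lie
            // below mu + alpha * sigma is Phi(alpha)^n. Choosing
            // alpha = Phi^-1((1 - confidence)^(1/n)) therefore makes a
            // standardized extreme beyond +/- alpha occur with probability of
            // about 'confidence' per tail under normality; such an extreme
            // flags the field for uniformization instead of normalization.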
            if (uniformize == 1) {
                real max = sc[i].max();
                real min = sc[i].min();
                real mu = sc[i].mean();
                real sigma = sc[i].stddev();
                int nsamp = (int) sc[i].nnonmissing();
                real confidence = 0.05;
                real alpha = gauss_01_quantile(pow((1 - confidence), 1 / real(nsamp)));
                if ((max - mu) / sigma > alpha || (min - mu) / sigma < -alpha) {
                    // Normality assumption violated.
                    to_uniformize = true;
                }
            }
            if (to_uniformize) {
                action ^= NORMALIZE;  // Remove the 'normalize' action...
                action |= UNIFORMIZE; // ...and add the 'uniformize' one.
                apply_normalization = false;
                *out << ":" << vm->fieldName(i) << endl;
                need_to_be_uniformized.append(inputsize);
            }

            // And apply normalization if we still need to do it.
            if (apply_normalization) {
                real mu = sc[i].mean();
                real sigma = sc[i].stddev();
                *out << mu << " - " << sigma << " / :" << vm->fieldName(i) << "\n";
            }

            // Increase the counter of inputs.
            inputsize++;
        }

        int n_discarded = 0;
        if (action & ONEHOT) {
            // First see whether any value must be discarded because it is not
            // present often enough in the dataset.
            int k = 0;
            TVec<bool> to_be_included(count);
            for (int j = 0; j < count; j++) {
                to_be_included[j] = true;
            }
            for (map<real,StatsCollectorCounts>::iterator it = sc[i].getCounts()->begin(); k < ((int)sc[i].getCounts()->size()) - 1; ++it) {
                if (it->second.n < n_enough) {
                    to_be_included[k] = false;
                    n_discarded++;
                }
                k++;
            }
            if (n_discarded <= count - 1) {
                // We only consider this field if there is at least 1 class left.
                real tol = 0;
                if (may_be_fraction) {
                    // We need to take a margin because of floating-point precision.
                    tol = DISCRETE_TOLERANCE;
                }
                RealMapping rm = sc[i].getAllValuesMapping(&to_be_included, 0, true, tol);
                *out << "@" << vm->fieldName(i) << " " << rm << " "
                     << rm.size() << " onehot :"
                     << vm->fieldName(i) << "_:0:" << (rm.size() - 1) << endl;
                inputsize += count - n_discarded;
            }
        }

        if (action & MISSING_BIT)
        {
            *out << "@" << vm->fieldName(i) << " isnan 1 0 ifelse :" << vm->fieldName(i) << "_mbit\n";
            inputsize++;
        }
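        // For illustration only (the field names and statistics are made up,
        // and '{...}' stands for whatever the RealMapping prints), the actions
        // above emit <PROCESSING> lines such as:
        //     @age isnan 40 @age ifelse 40 - 10 / :age    (NORMALIZE)
        //     @color {...} 3 onehot :color_:0:2           (ONEHOT)
        //     @age isnan 1 0 ifelse :age_mbit             (MISSING_BIT)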
        report << tostring(i) + " (" + vm->fieldName(i) + ") [c=" << count << " nm=" << sc[i].nnonmissing() << "] ";
        if (action == 0)
            report << "~~user intervention required: " << message;
        if (action & NORMALIZE)
            report << "NORMALIZE ";
        if (action & UNIFORMIZE)
            report << "UNIFORMIZE ";
        if (action & ONEHOT)
            report << "ONEHOT(" << count << ") - discarded: " << n_discarded << " ";
        if (type == discrete_corr)
            report << "correl: " << correlation << " 2tail-student: " << student << " ";
        if (action & MISSING_BIT)
            report << "MISSING_BIT ";
        if (action & SKIP)
            report << "SKIP ";
        if (additional_proc[i] != "") {
            // There is an additional processing to add after this field.
            *out << additional_proc[i] << endl;
            inputsize += additional_proc_size[i];
            report << "ADD_PROC ";
        }
        report << endl;

        pb->update(i);
    }

    delete pb;

    // Add the target.
    *out << "%" << target << " :target\n</PROCESSING>" << endl;

    // Add the sizes.
    *out << endl << "<SIZES>" << endl
         << inputsize << endl // inputsize
         << "1" << endl       // targetsize
         << "0" << endl       // weightsize
         << "</SIZES>" << endl;

    // Now build the uniformized VMatrix if 'uniformize' has been set.
    if (uniformize > 0) {
        // Prepare the 'shift' and 'scale' vectors to map uniformized fields to
        // [-1,1] instead of the default [0,1].
        Vec shift(inputsize + 1); // +1 because of the target.
        Vec scale(inputsize + 1);
        shift.fill(0);
        scale.fill(1);
        for (int i = 0; i < need_to_be_uniformized.length(); i++) {
            shift[need_to_be_uniformized[i]] = -0.5;
            scale[need_to_be_uniformized[i]] = 2;
        }
        // Write the .vmat file.
        *out_uni << "# Preprocessed VMat" << endl;
        *out_uni << "<SOURCES>" << endl;
        *out_uni << "@" << endl
                 << "ShiftAndRescaleVMatrix(" << endl
                 << "  automatic = 0" << endl
                 << "  shift = [" << shift << "]" << endl
                 << "  scale = [" << scale << "]" << endl
                 << "  underlying_vmat =" << endl;
        *out_uni << "  PLearnerOutputVMatrix(" << endl;
        *out_uni << "    train_learners = 1" << endl;
        *out_uni << "    data = AutoVMatrix(specification = \"" << filename_non_uni << "\")" << endl;
        *out_uni << "    learners = [" << endl;
        *out_uni << "      UniformizeLearner(" << endl;
        *out_uni << "        which_fieldnums = ";
        *out_uni << "[ " << need_to_be_uniformized << "]" << endl;
        *out_uni << "      )" << endl;
        *out_uni << "    ]" << endl;
        *out_uni << "  )" << endl
                 << ")" << endl;
        *out_uni << "</SOURCES>" << endl << endl;
        *out_uni << "<SIZES>" << endl
                 << inputsize << endl // inputsize
                 << "1" << endl       // targetsize
                 << "0" << endl       // weightsize
                 << "</SIZES>" << endl;
    }

    // Possibly add the <PRECOMPUTE> tag.
    if (precompute != "none") {
        *out << endl << "<PRECOMPUTE>" << endl << precompute << endl << "</PRECOMPUTE>" << endl;
        if (uniformize > 0) {
            *out_uni << endl << "<PRECOMPUTE>" << endl << precompute << endl << "</PRECOMPUTE>" << endl;
        }
    }

    // Free stuff.
    out->close();
    delete out;
    if (uniformize > 0) {
        out_uni->close();
        delete out_uni;
    }
}

///////////////////////
// stringToFieldType //
///////////////////////
PLearn::FieldConvertCommand::FieldType FieldConvertCommand::stringToFieldType(string s) {
    if (s.find("continuous") != string::npos)
        return continuous;
    else if (s.find("discrete_uncorr") != string::npos)
        return discrete_uncorr;
    else if (s.find("discrete_corr") != string::npos)
        return discrete_corr;
    else if (s.find("constant") != string::npos)
        return constant;
    else if (s.find("binary") != string::npos)
        return binary;
    else if (s.find("skip") != string::npos)
        return skip;
    else {
        PLERROR("In FieldConvertCommand::stringToFieldType Unknown field type: %s", s.c_str());
        return skip;
    }
}
