00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
#include "FieldConvertCommand.h"
00038
#include <plearn/db/getDataSet.h>
00039
#include <plearn/math/pl_erf.h>
00040
#include <plearn/math/random.h>
00041
#include <plearn/base/stringutils.h>
00042
#include <plearn/vmat/VMat.h>
00043
00044 #define NORMALIZE 1
00045 #define MISSING_BIT 2
00046 #define ONEHOT 4
00047 #define SKIP 16
00048 #define UNIFORMIZE 32
00049
00050
using namespace PLearn;
00051
00053
// Static registration object: constructing it at load time adds the
// "FieldConvert" command to PLearn's global command registry
// (presumably the registry takes ownership of the heap-allocated
// instance -- TODO confirm against PLearnCommandRegistry).
PLearnCommandRegistry FieldConvertCommand::reg_(
new FieldConvertCommand);
00054
00055
00057
00059 FieldConvertCommand::FieldConvertCommand()
00060 :
PLearnCommand("FieldConvert",
00061
00062 "Reads a dataset and generates a .vmat file based on the data, but optimized for training.\n",
00063
00064 "The nature of each field of the original dataset is automatically detected, and determines the approriate treatment.\n"
00065 "The possible field types with the corresponding treatment can be one of :\n"
00066 "continuous - quantitative data (data is
real): the field is replaced by the normalized data (
minus means, divided by stddev)\n"
00067 "binary - binary discrete data (is processed as a continuous field)\n"
00068 "discrete_uncorr - discrete integers (qualitative data, e.g : postal codes, categories) not corr. with target: the field is replaced by a group of fields in a one-hot fashion.\n"
00069 "discrete_corr - discrete integers, correlated with target : both the normalized and the
onehot versions of the field are used in the new dataset\n"
00070 "constant - constant data : the field is skipped (it is not present in the new dataset)\n"
00071 "skip - irrelevant data : the field is skipped (it is not present in the new dataset)\n"
00072 "\n"
00073 "When there are ambiguities, messages are displayed for the problematic field(s) and they are skipped. The user must
use a 'force' file,\n"
00074 "to explicitely force the types of the ambiguous field(s). The file is made of lines of the following possible formats:\n"
00075 "FIELDNAME=type\n"
00076 "fieldNumberA-fieldNumberB=type [e.g : 200-204=constant, to force a range]\n"
00077 "FIELDNAME+=\"processing\" (n_inputs) [to
add a home-made processing after a field; the number of inputs thus added must be given]\n
"
00078
"\n
"
00079
"Note that
for all types but skip,
if the field contains missing values, an additionnal 'missing-bit' field is added and is
'1' only
for missing values.\n
"
00080
"The
difference between types constant and skip is only cosmetic: constant means the field is constant,
while skip means either there are too many missing values or it has been forced to skip.\n
"
00081
"A report file is generated and contains the information about the processing
for each field.\n
"
00082
"Target index of source needs to be specified (ie. to perform corelation test). It can be any field of the
"
00083
"source dataset, but will be the last field of the
new dataset.*** We assume target is never missing *** \n\n
"
00084
"usage : FieldConvert\n
"
00085
" *source = [source dataset]\n
"
00086
" *destination = [
new dataset with vmat extension]\n
"
00087
" *target = [field index of target]\n
"
00088
" force = [force file]\n
"
00089
" report = [report file] (
default = 'FieldConvertReport.txt')\n
"
00090
" min_fraction = [
if number of unique values is > than 'fraction' * NonMISSING -> the field is continuous]\n
"
00091
" (
default = 0.3)\n
"
00092
" max_pvalue = [maximum pvalue to assume
correlation with target] (
default = 0.025)\n
"
00093
" frac_missing_to_skip = [
if MISSING >= 'frac_missing_to_skip * number of samples then
this field is skipped]\n
"
00094
" (
default = 1.0)\n
"
00095
" frac_enough = [
if a field is discrete, only values represented by at least frac_enough * nSamples\n
"
00096
" elements will be kept] (
default = 0.005)\n
"
00097
" precompute = [none | pmat | ... : possibly
add a <PRECOMPUTE> tag in the destination] (
default = none)\n
"
00098
" discrete_tolerance = [
if a discrete field has
float values, its one hot mapping will be enlarged according to\n
"
00099
" this factor] (
default = 0.001)\n
"
00100
" uniformize = [0 | 1 | 2: whether fields should be uniformized, 2 meaning all fields and 1 meaning only\n
"
00101
" fields obviously not following a
normal distribution] (
default = 0)\n
"
00102
"\n
"
00103
"where fields with asterix * are not optional\n
"
00104
)
00105
{}
00106
00108
// run //
00110 void FieldConvertCommand::run(const vector<string> & args)
00111
{
00112
// set default values
00113
UNIQUE_NMISSING_FRACTION_TO_ASSUME_CONTINUOUS = 0.3;
00114
PVALUE_THRESHOLD = 0.025;
00115
FRAC_MISSING_TO_SKIP = 1.0;
00116
FRAC_ENOUGH = 0.005;
00117
DISCRETE_TOLERANCE = 1e-3;
00118
target = -1;
00119
report_fn="FieldConvertReport.txt
";
00120
precompute = "none
";
00121
int uniformize = 0;
00122
00123
for(int i=0;i<(signed)args.size();i++)
00124
{
00125
vector<string> val = split(args[i],"=
");
00126
if(val.size()<2)
00127
PLERROR("bad argument: %s
",args[i].c_str());
00128
if(val[0]=="source
")
00129
source_fn=val[1];
00130
else if(val[0]=="destination
")
00131
desti_fn=val[1];
00132
else if(val[0]=="target
")
00133
target=toint(val[1]);
00134
else if(val[0]=="force
")
00135
force_fn=val[1];
00136
else if(val[0]=="report
")
00137
report_fn=val[1];
00138
else if(val[0]=="min_fraction
")
00139
UNIQUE_NMISSING_FRACTION_TO_ASSUME_CONTINUOUS=toreal(val[1]);
00140
else if(val[0]=="max_pvalue
")
00141
PVALUE_THRESHOLD=toreal(val[1]);
00142
else if(val[0]=="frac_missing_to_skip
")
00143
FRAC_MISSING_TO_SKIP=toreal(val[1]);
00144
else if(val[0]=="discrete_tolerance
")
00145
DISCRETE_TOLERANCE = toreal(val[1]);
00146
else if(val[0]=="uniformize
")
00147
uniformize = toint(val[1]);
00148
else if(val[0]=="frac_enough
")
00149
FRAC_ENOUGH=toreal(val[1]);
00150
else if(val[0]=="precompute
")
00151
precompute = val[1];
00152
else PLERROR("unknown argument: %s
",val[0].c_str());
00153
}
00154
if(source_fn=="")
00155
PLERROR("you must specify source file
");
00156
if(desti_fn=="")
00157
PLERROR("you must specify destination .vmat
");
00158
if(target==-1)
00159
PLERROR("you must specify source target field index
");
00160
00161
// manual map between field index and types
00162
map<int, FieldType> force;
00163
map<int, string> additional_proc;
00164
map<int, int> additional_proc_size;
00165
00166
real beta_hat,student=-1;
00167
real correlation = -1;
00168
00169
VMat vm = getDataSet(source_fn);
00170
00171
// A vector where we store the indices of the fields to be uniformized.
00172
TVec<int> need_to_be_uniformized;
00173
00174
if (target < 0 || target > vm->width()) {
00175
PLERROR("The target column you specified is not valid
");
00176
}
00177
00178
// Compute the result inputsize as the preprocessing goes on.
00179
int inputsize = 0;
00180
00181
cout<<"###
using field
"<<target<<" as target
"<<endl;
00182
00184
// read user custom operation from file 'force_fname'
00185
vector<string> forcelines;
00186
if(force_fn!="")
00187
forcelines = getNonBlankLines(loadFileAsString(force_fn));
00188
for(int i=0; i<(signed)forcelines.size();i++)
00189
{
00190
size_t pos_of_equal = forcelines[i].find('=');
00191
if (pos_of_equal == string::npos)
00192
PLERROR("In
FieldConvertCommand - A line in the force file does not contain the
'=' character
");
00193
vector<string> vec(2);
00194
vec[0] = forcelines[i].substr(0, pos_of_equal);
00195
vec[1] = forcelines[i].substr(pos_of_equal + 1);
00196
/* cout << "vec[0] =
" << vec[0] << endl;
00197
cout << "vec[1] =
" << vec[1] << endl; */
00198
vector<string> leftpart = split(vec[0],"-
");
00199
if (leftpart.size() == 1 && leftpart[0].substr(leftpart[0].size() - 1) == "+
") {
00200
// Syntax: field+="processing
" (number of inputs added)
00201
int field_index = vm->fieldIndex(leftpart[0].substr(0, leftpart[0].size() - 1));
00202
if (field_index == -1)
00203
PLERROR("In
FieldConvertCommand - A field was not found in the source
VMatrix");
00204
if (additional_proc[field_index] != "")
00205
PLERROR("In
FieldConvertCommand - There can be only one additional processing specified
for each field
");
00206
size_t last_open_par = vec[1].rfind('(');
00207
if (last_open_par == string::npos)
00208
PLERROR("In
FieldConvertCommand - You must specify the number of inputs added in a processing
");
00209
string added_inputs = vec[1].substr(last_open_par + 1, vec[1].rfind(')') - last_open_par - 1);
00210
// cout << "added_inputs =
" << added_inputs << endl;
00211
additional_proc_size[field_index] = toint(added_inputs);
00212
size_t first_comma = vec[1].find('"');
00213 size_t last_comma = vec[1].rfind(
'"', last_open_par);
00214 additional_proc[field_index] = vec[1].substr(first_comma + 1, last_comma - first_comma - 1);
00215
00216 }
else {
00217
FieldType rpart = stringToFieldType(vec[1]);
00218
00219
if(leftpart.size()>1)
00220 {
00221
00222
int a =
toint(leftpart[0]);
00223
int b =
toint(leftpart[1]);
00224
00225
for(
int j=a;j<=b;j++)
00226 force[j]=rpart;
00227 }
00228
else
00229 {
00230
if(vm->fieldIndex(vec[0])==-1)
00231 cout<<
"field : "<<vec[0]<<
" doesn't exist in matrix"<<
endl;
00232 force[vm->fieldIndex(vec[0])] = rpart;
00233 }
00234 }
00235 }
00237
00238 TVec<StatsCollector> sc;
00239 sc = vm->getStats();
00240
00241 ofstream* out;
00242 ofstream* out_uni = 0;
00243
string filename_non_uni = desti_fn +
".non_uniformized.vmat";
00244
if (uniformize > 0) {
00245
00246
00247 out =
new ofstream(filename_non_uni.c_str());
00248 out_uni =
new ofstream(desti_fn.c_str());
00249 }
else {
00250 out =
new ofstream(desti_fn.c_str());
00251 }
00252 ofstream report(report_fn.c_str());
00253 *out<<
"<SOURCES>\n"+source_fn+
"\n</SOURCES>\n<PROCESSING>\n";
00254
00255
00256
int n_enough = (
int) (FRAC_ENOUGH * vm->length());
00257
00258
ProgressBar* pb =
new ProgressBar(
"Analyzing fields", vm->width());
00259
00260
00261
for(
int i=0;i<vm->width();i++)
00262 {
00263 type=unknown;
00264 beta_hat=0;
00265
string message;
00266
int action = 0;
00267
int count = (
int)sc[i].getCounts()->size()-1;
00268
00269
00270
if(force.find(i) != force.end())
00271 type = force[i];
00272
else if(i==target)
00273
00274
00275 type=skip;
00276
00277
00278
if(sc[i].nnonmissing() <= (1-FRAC_MISSING_TO_SKIP) * vm->length()) {
00279
if (type != unknown && type != skip && type != constant) {
00280
00281 cout <<
"Warning: you forced the type of field number " << i <<
", "
00282 <<
"but there are too many missing values so it'll be skipped. "
00283 <<
"If you want to keep it, you'll have to add it by hand to the resulting .vmat"
00284 <<
endl;
00285 }
00286 type=skip;
00287 }
00288
00289
00290
00291
00292
if (
count == 2 && type != skip) {
00293
Vec counts(2);
00294
int k = 0;
00295
for(map<real,StatsCollectorCounts>::iterator it = sc[i].getCounts()->begin();
k <= 1; ++it) {
00296 counts[
k++] = it->second.n;
00297 }
00298
if (counts[0] >= n_enough && counts[1] >= n_enough) {
00299
if (type != unknown && type != binary) {
00300 cout <<
"Warning: type for field number " << i <<
" set to binary, "
00301 <<
"but you had forced it to something else." <<
endl;
00302 }
00303 type = binary;
00304 }
else {
00305
00306
if (type != unknown && type != skip) {
00307 cout <<
"Warning: field number " << i <<
" is binary but doesn't have "
00308 <<
"enough representants of each class, thus it'll be skipped, "
00309 <<
"even if you had forced it to some other type (edit the resulting "
00310 <<
".vmat if you really want to add it)." <<
endl;
00311 }
00312 type = skip;
00313
00314
00315 }
00316 }
00317
00318
00319
if(
count<=1 && type != skip && type != constant) {
00320
if(sc[i].nmissing()>0 && sc[i].nmissing()<vm->length()) {
00321
00322
00323 message =
"Constant field, but there are " +
tostring(sc[i].nmissing()) +
00324
" missing values. Force the type, or modify this program !";
00325 }
00326
else {
00327
00328
if (type != unknown) {
00329 cout <<
"Warning: field number " << i <<
" has been forced, but "
00330 <<
"appears to be constant. Edit the resulting .vmat if you "
00331 <<
"really want to add it." <<
endl;
00332 }
00333 type=constant;
00334 }
00335 }
00336
00337
00338
00339
00340
00341
00342
bool may_be_fraction =
false;
00343
if (type == continuous || type == binary) {
00344 may_be_fraction =
true;
00345 }
else if (type != skip && type != constant) {
00346
int k = 0;
00347
for (map<real,StatsCollectorCounts>::iterator it = sc[i].getCounts()->begin();
k <
count; ++it) {
00348
real val = it->first;
00349
k++;
00350
if((
val-(
int)
val) != 0)
00351 {
00352 may_be_fraction =
true;
00353
break;
00354 }
00355 }
00356 }
00357
00358
00359
if (type == unknown && message ==
"")
00360 {
00361
00362
if(sc[i].max()>-1000 && vm->getStringToRealMapping(i).size()>0)
00363 message=
"Field uses both string map & numerical values";
00364
else if(sc[i].min() >= 0 && sc[i].max() >= 12000 && sc[i].max() <= 20000) {
00365
00366
00367
bool non_integer =
false;
00368
for(
int j=0;j<vm->length();j++)
00369 {
00370
real val = vm->get(j,i);
00371
if(!
is_missing(val) && ((
val-(
int)
val) > 0))
00372 non_integer =
true;
00373 }
00374
if (!non_integer) {
00375 message =
"Looks like a numeric SAS date. If this is the case, first edit the source (.vmat) file to change the 'TextFilesVMatrix' field type (use sas_date), then edit force.txt to force the type to continuous. If it's not a date, please use force.txt to force the type.";
00376 }
00377 }
00378
else if(sc[i].min()>19700000 && sc[i].max()<20080000)
00379
00380 message=
"Looks like a date. Edit the source file to change the 'TextFilesVMatrix' field type (use jdate). Otherwise, edit force.txt to force the type.";
00381
00382
00383
else if((
count >=
MIN( UNIQUE_NMISSING_FRACTION_TO_ASSUME_CONTINUOUS * sc[i].nnonmissing(), 2000))
00384 && vm->getStringToRealMapping(i).size()==0)
00385 type=continuous;
00386
else {
00387
00388
if (may_be_fraction) {
00389 type=continuous;
00390 }
00391 }
00392
00393
00394
00395
if(type==unknown && message==
"")
00396 {
00397
00398
real sigma_hat=0,sigma_beta_hat=0;
00399
real xmean = sc[i].mean();
00400
real ymean = sc[target].mean();
00401
real x_minus_xmean_square=0;
00402
real y_minus_ymean_square=0;
00403
00404
int len_nm = 0;
00405
int len = vm->length();
00406
00407
Vec x(len);
00408
Vec y(len);
00409 vm->getColumn(i, x);
00410 vm->getColumn(target, y);
00411
00412
00413
for(
int j=0;j<len;j++)
00414
if(!
is_missing(x[j]) && !
is_missing(y[j]))
00415 {
00416
real xdiff =
x[j] - xmean;
00417
real ydiff = y[j] - ymean;
00418 beta_hat += xdiff * ydiff;
00419 x_minus_xmean_square += xdiff * xdiff;
00420 y_minus_ymean_square += ydiff * ydiff;
00421 len_nm++;
00422 }
00423
00424
00425
correlation = fabs(beta_hat) /
sqrt(x_minus_xmean_square * y_minus_ymean_square);
00426
00427 beta_hat /= x_minus_xmean_square;
00428
00429
00430
for(
int j=0;j<len;j++)
00431
if(!
is_missing(x[j]) && !
is_missing(y[j]))
00432 sigma_hat +=
square(y[j]-ymean - beta_hat*(x[j]-xmean));
00433 sigma_hat /= len_nm-2;
00434
00435 sigma_beta_hat = sigma_hat / x_minus_xmean_square;
00436
00437
real t = beta_hat /
sqrt(sigma_beta_hat);
00438
00439 student = 2 *
student_t_cdf(-fabs(t), len_nm-2);
00440
if(student < PVALUE_THRESHOLD)
00441 {
00442
00443 type = discrete_corr;
00444
00445 }
00446 }
00447
00448
00449
if(type==unknown && message==
"")
00450
00451
if((
real)(sc[i].max()-sc[i].min()+1) > (
real)(
count)*2 ) {
00452 type=continuous;
00453
00454 }
00455
else if((
real)(sc[i].max()-sc[i].min()+1) != (
real)(
count) )
00456 message =
"(edit force.txt): Data is made of a semi-sparse (density<50%) distribution of integers (uncorrelated with target). max: "+
tostring(sc[i].
max())+
" min:"+
tostring(sc[i].
min())+
" count:"+
tostring(count);
00457
else {
00458
00459
00460 type = discrete_uncorr;
00461
00462 }
00463 }
00464
00465
00466
00467
00468
if (type == binary)
00469 type = continuous;
00470
00471
if(type==unknown)
00472 cout<<
tostring(i)+
" ("+vm->fieldName(i)+
") "<<message<<
endl;
00473
else if(type==continuous)
00474 {
00475 action |=
NORMALIZE;
00476
if(sc[i].nmissing()>0)
00477 action |=
MISSING_BIT;
00478 }
00479
else if(type==discrete_uncorr)
00480 {
00481 action =
ONEHOT;
00482
if(sc[i].nmissing()>0)
00483 action |=
MISSING_BIT;
00484 }
00485
else if(type==skip || type==constant)
00486 {
00487 action =
SKIP;
00488 }
00489
else if(type==discrete_corr)
00490 {
00491 action |=
NORMALIZE;
00492 action |=
ONEHOT;
00493
if(sc[i].nmissing()>0)
00494 action |=
MISSING_BIT;
00495 }
00496
00497
00498
00499
if(action&
NORMALIZE)
00500 {
00501
00502 *out <<
"@" << vm->fieldName(i) <<
" ";
00503
00504
if(sc[i].nmissing()>0)
00505 {
00506
00507
double maxi=-1;
00508
real missingval = -1;
00509
for(map<real,StatsCollectorCounts>::iterator it = sc[i].getCounts()->begin(); it!=sc[i].getCounts()->end(); ++it)
00510
if(it->second.n > maxi)
00511 {
00512 maxi=it->second.n;
00513 missingval=it->first;
00514 }
00515
if(maxi<10)
00516
00517 missingval=sc[i].mean();
00518
else {
00519
00520
00521 }
00522
00523 *out <<
"isnan " << missingval <<
" @" << vm->fieldName(i) <<
" ifelse ";
00524 }
00525
00526
00527
bool to_uniformize = (uniformize == 2);
00528
00529
00530
00531
bool apply_normalization =
true;
00532
if (uniformize == 1) {
00533
real max = sc[i].max();
00534
real min = sc[i].min();
00535
real mu = sc[i].mean();
00536
real sigma = sc[i].stddev();
00537
int nsamp = (
int) sc[i].nnonmissing();
00538
real confidence = 0.05;
00539
real alpha =
gauss_01_quantile(
pow((1 - confidence), 1 /
real(nsamp)));
00540
if ( (
max - mu) / sigma > alpha || (
min - mu) / sigma < - alpha) {
00541
00542 to_uniformize =
true;
00543 }
00544 }
00545
if (to_uniformize) {
00546 action ^=
NORMALIZE;
00547 action |=
UNIFORMIZE;
00548 apply_normalization =
false;
00549 *out <<
":" << vm->fieldName(i) <<
endl;
00550 need_to_be_uniformized.append(inputsize);
00551 }
00552
00553
00554
if (apply_normalization) {
00555
real mu = sc[i].mean();
00556
real sigma = sc[i].stddev();
00557 *out << mu <<
" - " << sigma <<
" / :" << vm->fieldName(i)<<
"\n";
00558 }
00559
00560
00561 inputsize++;
00562 }
00563
00564
int n_discarded = 0;
00565
if(action&
ONEHOT) {
00566
00567
00568
int k = 0;
00569 TVec<bool> to_be_included(count);
00570
for (
int j = 0; j <
count; j++) {
00571 to_be_included[j] =
true;
00572 }
00573
for(map<real,StatsCollectorCounts>::iterator it = sc[i].getCounts()->begin();
k<((
int)sc[i].getCounts()->size()) - 1; ++it) {
00574
if (it->second.n < n_enough) {
00575 to_be_included[
k] =
false;
00576 n_discarded++;
00577
00578
00579 }
00580
k++;
00581 }
00582
if (n_discarded <=
count - 1) {
00583
00584
00585
00586
00587
00588
real tol = 0;
00589
if (may_be_fraction) {
00590
00591 tol = DISCRETE_TOLERANCE;
00592 }
00593
RealMapping rm = sc[i].getAllValuesMapping(&to_be_included, 0,
true, tol);
00594 *out <<
"@"<<vm->fieldName(i) <<
" " <<
rm <<
" "
00595 <<
rm.size() <<
" onehot :"
00596 << vm->fieldName(i)<<
"_:0:"<< (
rm.size() - 1) <<
endl;
00597
00598
00599
00600 inputsize +=
count - n_discarded;
00601 }
00602 }
00603
00604
if(action&
MISSING_BIT)
00605 {
00606 *out<<
"@"<<vm->fieldName(i)<<
" isnan 1 0 ifelse :"<<vm->fieldName(i)<<
"_mbit\n";
00607 inputsize++;
00608 }
00609
00610 report<<
tostring(i)+
" ("+vm->fieldName(i)+
") [c="<<
count<<
" nm="<<sc[i].nnonmissing()<<
"] ";
00611
if(action==0)report<<
"~~user intervention required :"<<message;
00612
if(action&
NORMALIZE) {
00613 report <<
"NORMALIZE ";
00614
00615
00616
00617 }
00618
if (action &
UNIFORMIZE) report <<
"UNIFORMIZE ";
00619
if (action&
ONEHOT) report<<
"ONEHOT("<<
count<<
") - discarded: " << n_discarded <<
" ";
00620
if (type==discrete_corr) report<<
"correl: "<<
correlation<<
" 2tail-student:"<<student<<
" ";
00621
if (action&
MISSING_BIT) report<<
"MISSING_BIT ";
00622
if (action&
SKIP) report<<
"SKIP ";
00623
if (additional_proc[i] !=
"") {
00624
00625 *out << additional_proc[i] <<
endl;
00626 inputsize += additional_proc_size[i];
00627 report <<
"ADD_PROC ";
00628 }
00629 report<<
endl;
00630
00631 pb->
update(i);
00632
00633 }
00634
00635
delete pb;
00636
00637
00638 *out <<
"%" << target <<
" :target\n</PROCESSING>"<<
endl;
00639
00640
00641 *out <<
endl <<
"<SIZES>" <<
endl
00642 << inputsize <<
endl
00643 <<
"1" <<
endl
00644 <<
"0" <<
endl
00645 <<
"</SIZES>" <<
endl;
00646
00647
00648
if (uniformize > 0) {
00649
00650
00651
Vec shift(inputsize + 1);
00652
Vec scale(inputsize + 1);
00653 shift.fill(0);
00654 scale.fill(1);
00655
for (
int i = 0; i < need_to_be_uniformized.length(); i++) {
00656 shift[need_to_be_uniformized[i]] = -0.5;
00657 scale[need_to_be_uniformized[i]] = 2;
00658 }
00659
00660 *out_uni <<
"# Preprocessed VMat" <<
endl;
00661 *out_uni <<
"<SOURCES>" <<
endl;
00662 *out_uni <<
"@" <<
endl
00663 <<
"ShiftAndRescaleVMatrix(" <<
endl
00664 <<
" automatic = 0" <<
endl
00665 <<
" shift = [" << shift <<
"]" <<
endl
00666 <<
" scale = [" << scale <<
"]" <<
endl
00667 <<
" underlying_vmat =" <<
endl;
00668 *out_uni <<
" PLearnerOutputVMatrix(" <<
endl;
00669 *out_uni <<
" train_learners = 1" <<
endl;
00670 *out_uni <<
" data = AutoVMatrix(specification = \"" << filename_non_uni <<
"\")" <<
endl;
00671 *out_uni <<
" learners = [" <<
endl;
00672 *out_uni <<
" UniformizeLearner(" <<
endl;
00673 *out_uni <<
" which_fieldnums = ";
00674 *out_uni <<
"[ " << need_to_be_uniformized <<
"]" <<
endl;
00675 *out_uni <<
" )" <<
endl;
00676 *out_uni <<
" ]" <<
endl;
00677 *out_uni <<
" )" <<
endl
00678 <<
")" <<
endl;
00679 *out_uni <<
"</SOURCES>" <<
endl <<
endl;
00680 *out_uni <<
"<SIZES>" <<
endl
00681 << inputsize <<
endl
00682 <<
"1" <<
endl
00683 <<
"0" <<
endl
00684 <<
"</SIZES>" <<
endl;
00685 }
00686
00687
00688
if (precompute !=
"none") {
00689 *out <<
endl <<
"<PRECOMPUTE>" <<
endl << precompute <<
endl <<
"</PRECOMPUTE>" <<
endl;
00690
if (uniformize > 0) {
00691 *out_uni <<
endl <<
"<PRECOMPUTE>" <<
endl << precompute <<
endl <<
"</PRECOMPUTE>" <<
endl;
00692 }
00693 }
00694
00695
00696 out->close();
00697
delete out;
00698
if (uniformize > 0) {
00699 out_uni->close();
00700
delete out_uni;
00701 }
00702
00703 }
00704
00706
00708 PLearn::FieldConvertCommand::FieldType FieldConvertCommand::stringToFieldType(
string s) {
00709
if (s.find(
"continuous") != string::npos)
00710
return continuous;
00711
else if (s.find(
"discrete_uncorr")!= string::npos )
00712
return discrete_uncorr;
00713
else if (s.find(
"discrete_corr") != string::npos)
00714
return discrete_corr;
00715
else if (s.find(
"constant") != string::npos)
00716
return constant;
00717
else if (s.find(
"binary") != string::npos)
00718
return binary;
00719
else if (s.find(
"skip") != string::npos)
00720
return skip;
00721
else {
00722
PLERROR(
"In FieldConvertCommand::stringToFieldType Unknown field type: %s",s.c_str());
00723
return skip;
00724 }
00725 }
00726