00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
#include <plearn/math/TMat_maths.h>
00035
00036
#include "SDBVMat.h"
00037
00038
00039
#ifndef WIN32
00040
#include <plearn/sys/Profiler.h>
00041
#endif
00042
00043
namespace PLearn {
00044
using namespace std;
00045
00046
00047
00048
00049 SDBVMOutputCoder::SDBVMOutputCoder(SDBVMOutputCoding oc,
00050
real missing_values_mapping)
00051 : output_coding_(oc), num_classes_(0),
00052 missing_values_mapping_(missing_values_mapping)
00053 {}
00054
00055 SDBVMOutputCoder::~SDBVMOutputCoder()
00056 {}
00057
00058 SDBVMOutputCoding SDBVMOutputCoder::getOutputCoding()
const
00059
{
00060
return output_coding_;
00061 }
00062
00063 void SDBVMOutputCoder::setNumClasses(
int n_classes)
00064 {
00065
num_classes_ = n_classes;
00066 }
00067
00068 int SDBVMOutputCoder::getNumClasses()
const
00069
{
00070
return num_classes_;
00071 }
00072
00073 void SDBVMOutputCoder::setMissingValuesMapping(
real missing_values_mapping)
00074 {
00075
missing_values_mapping_ = missing_values_mapping;
00076 }
00077
00078 real SDBVMOutputCoder::getMissingValuesMapping()
const
00079
{
00080
return missing_values_mapping_;
00081 }
00082
00083 void SDBVMOutputCoder::setOutput(
real output_value,
const Vec& output_field)
const
00084
{
00085
if (
is_missing(output_value))
00086 output_value =
missing_values_mapping_;
00087
int output_int =
int(output_value);
00088
00089
switch (
output_coding_) {
00090
case SDBVMNumeric:
00091 output_field[0] = output_value;
00092
break;
00093
00094
case SDBVMOneHot:
00095
00096
00097
if (
is_missing(output_value))
00098 output_field.
fill(
MISSING_VALUE);
00099
else {
00100
if (output_value < 0 || output_value >=
num_classes_)
00101
PLERROR(
"In SDBVMOutputCoder::setOutput: "
00102
"Output value not in the range [0,%d]",
num_classes_-1);
00103
00104 output_field.
fill(0.0);
00105 output_field[output_int] = 1.0;
00106 }
00107
break;
00108
00109
case SDBVMOneHotMinus1:
00110
00111
00112
if (
is_missing(output_value))
00113 output_field.
fill(
MISSING_VALUE);
00114
else {
00115
00116
00117
00118
00119
00120
00121 output_field.
fill(0.0);
00122
if (output_value < 0 || output_value >=
num_classes_)
00123
PLERROR(
"In SDBVMOutputCoder::setOutput: "
00124
"Output value not in the range [0,%d]",
num_classes_-1);
00125
if (output_int > 0) {
00126 --output_int;
00127
00128 output_field[output_int] = 1.0;
00129 }
00130 }
00131
break;
00132
00133
default:
00134
PLERROR(
"In SDBVMOutputCoder::setOutput: "
00135
"Unknown coding type: %d",
int(
output_coding_));
00136 }
00137 }
00138
00139 int SDBVMOutputCoder::fieldWidth()
const
00140
{
00141
switch (
output_coding_) {
00142
case SDBVMNumeric:
00143
return 1;
00144
00145
case SDBVMOneHot:
00146
if (
num_classes_ == 0)
00147
PLERROR(
"In SDBVMOutputCoder::fieldWidth: "
00148
"number of output classes not specified");
00149
return num_classes_;
00150
00151
case SDBVMOneHotMinus1:
00152
if (
num_classes_ == 0)
00153
PLERROR(
"In SDBVMOutputCoder::fieldWidth: "
00154
"number of output classes not specified");
00155
return num_classes_ - 1;
00156
00157
default:
00158
PLERROR(
"In SDBVMOutputCoder::fieldWidth: "
00159
"Unknown coding type: %d",
int(
output_coding_));
00160 }
00161
return 0;
00162 }
00163
00164
template <
class Mapping>
00165 int SDBVMOutputCoder::getNumClasses(
const Mapping& mapping,
00166
real other_values_mapping,
00167
real missing_values_mapping)
00168 {
00169
typename Mapping::const_iterator it = mapping.begin(), end = mapping.end();
00170
int max_of_map = INT_MIN;
00171
bool all_int =
true;
00172
00173
00174
for (; it != end; ++it) {
00175
if (all_int && it->second >= 0 && it->second ==
int(it->second))
00176 max_of_map = std::max(max_of_map,
int(it->second));
00177
else
00178 all_int =
false;
00179 }
00180
00181
return handleOtherAndMissing(all_int, max_of_map,
00182 other_values_mapping, missing_values_mapping);
00183 }
00184
00185 int SDBVMOutputCoder::handleOtherAndMissing(
bool all_int,
00186
int candidate_max,
00187
real other_values_mapping,
00188
real missing_values_mapping)
00189 {
00190
00191
00192
if (all_int && finite(other_values_mapping) &&
00193 other_values_mapping ==
int(other_values_mapping))
00194 candidate_max = std::max(candidate_max,
int(other_values_mapping));
00195
else
00196
00197
00198
if (!
is_missing(other_values_mapping))
00199 all_int =
false;
00200
00201
if (all_int && finite(missing_values_mapping) &&
00202 missing_values_mapping == int(missing_values_mapping))
00203 candidate_max = std::max(candidate_max,
int(missing_values_mapping));
00204
else
00205
00206
00207
if (!
is_missing(missing_values_mapping))
00208 all_int =
false;
00209
00210
return all_int? candidate_max+1 : 0;
00211 }
00212
00213
00214
00215
00216 SDBVMatrix::SDBVMatrix(
const string& dbname,
bool detect_missing)
00217 : sdb_(dbname, "",
SDB::readonly), detect_missing_(detect_missing)
00218 {
00219
row_ =
Row(&
sdb_.
getSchema());
00220 length_ =
sdb_.
size();
00221 width_ = 0;
00222
if (
sdb_.
hasStats())
00223
sdb_.
loadStats();
00224
else {
00225
sdb_.
computeStats();
00226
sdb_.
saveStats();
00227 }
00228 }
00229
00230 void SDBVMatrix::appendField(
const string& name,
SDBVMField* new_field)
00231 {
00232
int fieldwidth = new_field->
fieldWidth();
00233
vector<string> fieldnames =
split(name);
00234
if(fieldwidth>1 &&
int(fieldnames.size())==fieldwidth)
00235 {
00236
for(
unsigned int i=0; i<fieldnames.size(); i++)
00237 {
00238 width_++;
00239 declareField(width_-1,fieldnames[i],new_field->
fieldType());
00240 }
00241 }
00242
else
00243 {
00244
for(
int k=0;
k<fieldwidth;
k++)
00245 {
00246 width_++;
00247 declareField(width_-1,name,new_field->
fieldType());
00248 }
00249 }
00250
00251
fields_.push_back(
PSDBVMField(new_field));
00252 current_row.
resize(width_);
00253 }
00254
00255 void SDBVMatrix::getRow(
int i,
Vec v)
const
00256
{
00257
00258
00259
sdb_.
getInRow(i,
row_);
00260
00261 FieldsVector::const_iterator it =
fields_.begin(), end =
fields_.end();
00262
int curpos=0, curwidth;
00263
for (
int f=0; it != end; ++it, curpos+=curwidth, f++) {
00264 curwidth = (*it)->fieldWidth();
00265
Vec output = v.
subVec(curpos,curwidth);
00266
00267 (*it)->convertField(
sdb_,
row_, output);
00268
00269
if (
detect_missing_ && output.
hasMissing())
00270
PLWARNING(
"SDBVMatrix::getRow(%d,v) has missing value for %d-th field (columns %d-%d)",
00271 i,f,curpos,curpos+curwidth-1);
00272 }
00273
00274 }
00275
00276
00277
00278
00279
// Base-class default: a generic field reports an unknown output coding.
SDBVMOutputCoding SDBVMField::getOutputCoding() const
{
    return SDBVMUnknownCoding;
}
00284
00285 void SDBVMField::convertMissing(
const Vec& output)
const
00286
{
00287
for (
int i=0;i<output.
length();i++)
00288
if (
is_missing(output[i]))
00289 output[i]=
missing_values_mapping_;
00290 }
00291
00292
00293
00294
00295 void SDBVMFieldAsIs::convertField(
const SDBWithStats& sdb,
const Row& row,
00296
const Vec& output)
const
00297
{
00298 output[0] =
real(source_.
getValue(sdb,row));
00299 convertMissing(output);
00300 }
00301
00302 int SDBVMFieldAsIs::fieldWidth()
const
00303
{
00304
return 1;
00305 }
00306
00307
00308
00309
00310
00311
00312 void SDBVMFieldNormalize::convertField(
const SDBWithStats& sdb,
00313
const Row& row,
const Vec& output)
const
00314
{
00315
real x = source_.
getValue(sdb,row);
00316
const FieldStat& stat = source_.
getFieldStat(sdb,row);
00317 output[0] = (
x - stat.
mean()) / stat.
stddev();
00318 convertMissing(output);
00319 }
00320
00321 int SDBVMFieldNormalize::fieldWidth()
const
00322
{
00323
return 1;
00324 }
00325
00326
00327
00328
00329
00330
00331 void SDBVMFieldDivSigma::convertField(
const SDBWithStats& sdb,
00332
const Row& row,
const Vec& output)
const
00333
{
00334
real x = source_.
getValue(sdb,row);
00335
const FieldStat& stat = source_.
getFieldStat(sdb,row);
00336 output[0] =
x / stat.
stddev();
00337 convertMissing(output);
00338 }
00339
00340 int SDBVMFieldDivSigma::fieldWidth()
const
00341
{
00342
return 1;
00343 }
00344
00345
00346
00347
00348
00349 void SDBVMFieldAffine::convertField(
const SDBWithStats& sdb,
00350
const Row& row,
const Vec& output)
const
00351
{
00352
real v = source_.
getValue(sdb,row);
00353
00354
00355
00356 output[0] =
a_ * v +
b_;
00357 convertMissing(output);
00358 }
00359
00360 int SDBVMFieldAffine::fieldWidth()
const
00361
{
00362
return 1;
00363 }
00364
00365
00366
00367
00368
00369
00370 void SDBVMFieldPosAffine::convertField(
const SDBWithStats& sdb,
00371
const Row& row,
const Vec& output)
const
00372
{
00373 output[0] =
a_ * std::max(
real(source_.
getValue(sdb,row)),
real(0.)) +
b_;
00374 convertMissing(output);
00375 }
00376
00377 int SDBVMFieldPosAffine::fieldWidth()
const
00378
{
00379
return 1;
00380 }
00381
00382
00383
00384
00385
00386 void SDBVMFieldSignedPower::convertField(
const SDBWithStats& sdb,
00387
const Row& row,
const Vec& output)
const
00388
{
00389
real x = source_.
getValue(sdb,row);
00390
real sign_x =
x>=0.0 ? 1.0 : -1.0;
00391 output[0] = sign_x *
mypow(
x*sign_x,
a_);
00392 convertMissing(output);
00393 }
00394
00395 int SDBVMFieldSignedPower::fieldWidth()
const
00396
{
00397
return 1;
00398 }
00399
00400
00401
00402
00403
00404 void SDBVMFieldFunc1::convertField(
const SDBWithStats& sdb,
00405
const Row& row,
const Vec& output)
const
00406
{
00407
Vec input(1);
00408 input[0] = source_.
getValue(sdb,row);
00409 output <<
func_(input);
00410 convertMissing(output);
00411 }
00412
00413 int SDBVMFieldFunc1::fieldWidth()
const
00414
{
00415
return func_->outputsize;
00416 }
00417
00418
00419
00420
00421
00422
00423 void SDBVMFieldFunc2::convertField(
const SDBWithStats& sdb,
00424
const Row& row,
const Vec& output)
const
00425
{
00426
Vec input1(1), input2(1);
00427 input1[0] =
real(source1_.
getValue(sdb,row));
00428 input2[0] = real(source2_.
getValue(sdb,row));
00429 output[0] =
func_(input1, input2);
00430 convertMissing(output);
00431 }
00432
00433 int SDBVMFieldFunc2::fieldWidth()
const
00434
{
00435
return 1;
00436 }
00437
00438
00439
00440
00441
00442
00443 void SDBVMFieldDate::convertField(
const SDBWithStats& sdb,
00444
const Row& row,
const Vec& output)
const
00445
{
00446
real realval = source_.
getValue(sdb,row);
00447
if(
is_missing(realval)) {
00448 output[0] = missing_values_mapping_;
00449 output[1] = missing_values_mapping_;
00450 output[2] = missing_values_mapping_;
00451 }
00452
else {
00453
PDate d =
float_to_date(realval);
00454 output[0] =
real(d.
year);
00455 output[1] = real(d.
month);
00456 output[2] = real(d.
day);
00457 }
00458 }
00459
00460 int SDBVMFieldDate::fieldWidth()
const
00461
{
00462
return 3;
00463 }
00464
00465
00466
00467
00468
00469
00470 void SDBVMFieldDay::convertField(
const SDBWithStats& sdb,
00471
const Row& row,
const Vec& output)
const
00472
{
00473
real realval = source_.
getValue(sdb,row);
00474
PDate d =
float_to_date(realval);
00475
00476
00477 output[0] = ((d.
year-1990)*365+(d.
month-1)*30+(d.
day-1))/3650.0;
00478 convertMissing(output);
00479 }
00480
00481 int SDBVMFieldDay::fieldWidth()
const
00482
{
00483
return 1;
00484 }
00485
00486
00487
00488
00489
00490 void SDBVMFieldMonths::convertField(
const SDBWithStats& sdb,
00491
const Row& row,
const Vec& output)
const
00492
{
00493
real realval = source_.
getValue(sdb,row);
00494
PDate d =
float_to_date(realval);
00495 output[0] = d.
year*12 + (d.
month-1);
00496 convertMissing(output);
00497 }
00498
00499 int SDBVMFieldMonths::fieldWidth()
const
00500
{
00501
return 1;
00502 }
00503
00504
00505
00506
00507 void SDBVMFieldDateDiff::convertField(
const SDBWithStats& sdb,
00508
const Row& row,
const Vec& output)
const
00509
{
00510
00511
00512
FieldValue v1 = source1_.
getValue(sdb,row);
00513
if (!v1.
isMissing() && !
date1_threshold_.
isMissing() &&
00514 v1 <=
date1_threshold_)
00515 v1.
setMissing();
00516
00517
PDate d1 = v1.
toDate();
00518
00519
FieldValue v2;
00520
PDate d2 =
refdate;
00521 output[0] =
MISSING_VALUE;
00522
00523
if(!d1.
isMissing() && refdate.
isMissing())
00524 {
00525 v2 = source2_.
getValue(sdb,row);
00526
if (!v2.
isMissing() && !
date2_threshold_.
isMissing() &&
00527 v2 <=
date2_threshold_)
00528 v2.
setMissing();
00529
00530
if(v2.
isDate())
00531 d2 = v2.
toDate();
00532
else if(v2.
isIntegral()) {
00533
if (!v2.
isMissing()) {
00534
int value2 =
int(v2);
00535
switch(
unit) {
00536
case 'D':
00537 output[0] = d1.
toJulianDay() - value2;
00538
break;
00539
case 'M':
00540 output[0] = d1.
month - value2;
00541
break;
00542
case 'Y':
00543 output[0] = d1.
year - value2;
00544
break;
00545
default:
00546
PLERROR(
"In SDBVMFieldDateDiff: unrecognized unit %c",
unit);
00547 }
00548
return;
00549 }
00550 }
00551
else
00552
PLERROR(
"In SDBVMFieldDateDiff convertField: second field is neither "
00553
"a date nor an integer type!");
00554 }
00555
00556
00557
if(!d1.
isMissing() && d1.
year<1900)
00558 d1.
setMissing();
00559
if(!d2.
isMissing() && d2.
year<1900)
00560 d2.
setMissing();
00561
if(!d1.
isMissing() && !d2.
isMissing())
00562 {
00563
switch(
unit)
00564 {
00565
case 'D':
00566 output[0] = d1-d2;
00567
break;
00568
case 'M':
00569 output[0] = (d1.
year*12+d1.
month) - (d2.year*12+d2.month);
00570
break;
00571
case 'Y':
00572 output[0] = d1.
year - d2.year;
00573
break;
00574
default:
00575
PLERROR(
"In SDBVMFieldDateDiff: unrecognized unit %c",
unit);
00576 }
00577 }
00578 }
00579
00580 int SDBVMFieldDateDiff::fieldWidth()
const
00581
{
00582
return 1;
00583 }
00584
00585
00586
00587
00588 SDBVMFieldDiscrete::SDBVMFieldDiscrete(
SDBVMSource source,
int num_classes,
00589
real missing_values_mapping, SDBVMOutputCoding oc, VMField::FieldType ft)
00590 :
inherited(source,missing_values_mapping,ft), num_classes_(num_classes),
00591 output_coder_(new
SDBVMOutputCoder(oc, missing_values_mapping))
00592 {
00593
output_coder_->setNumClasses(num_classes);
00594 }
00595
00596
00597 SDBVMFieldDiscrete::SDBVMFieldDiscrete(
SDBVMSource source,
SDBVMOutputCoder* oc,
00598
int num_classes,
real missing_values_mapping, VMField::FieldType ft)
00599 :
inherited(source, missing_values_mapping,ft),
00600 num_classes_(num_classes), output_coder_(oc)
00601 {
00602
output_coder_->setNumClasses(num_classes);
00603 }
00604
00605 void SDBVMFieldDiscrete::convertField(
const SDBWithStats& sdb,
const Row& row,
00606
const Vec& output)
const
00607
{
00608
real value =
getDiscreteValue(sdb, row);
00609
output_coder_->setOutput(value, output);
00610 }
00611
00612 int SDBVMFieldDiscrete::fieldWidth()
const
00613
{
00614
return output_coder_->fieldWidth();
00615 }
00616
00617 SDBVMOutputCoding SDBVMFieldDiscrete::getOutputCoding()
const
00618
{
00619
return output_coder_->getOutputCoding();
00620 }
00621
00622 void SDBVMFieldDiscrete::setNumClasses(
int num_classes)
00623 {
00624
num_classes_ = num_classes;
00625
output_coder_->setNumClasses(num_classes);
00626 }
00627
00628
00629
00630
00631 void SDBVMFieldDateGreater::convertField(
const SDBWithStats& sdb,
00632
const Row& row,
const Vec& output)
const
00633
{
00634
PDate d = source_.
getValue(sdb,row);
00635
00636
if (d>
ref)
00637 output[0]=1;
00638
else
00639 output[0]=0;
00640 }
00641
00642 int SDBVMFieldDateGreater::fieldWidth()
const
00643
{
00644
return 1;
00645 }
00646
00647 real SDBVMFieldDateGreater::getDiscreteValue(
const SDBWithStats& sdb,
const
00648
Row& row)
const
00649
{
00650
00651
FieldValue v = source_.
getValue(sdb,row);
00652
if(v.
isMissing())
00653
return missing_values_mapping_;
00654
return v.
toDate()>
ref ?1 :0;
00655 }
00656
00657
00658
00659 real SDBVMFieldCodeAsIs::getDiscreteValue(
const SDBWithStats& sdb,
00660
const Row& row)
const
00661
{
00662
FieldValue v = source_.
getValue(sdb,row);
00663
return v.
isMissing() ?missing_values_mapping_ :
real(v);
00664 }
00665
00666
00667
00668
00669
// Parses a whitespace-separated list "key value key value ..." of reals into
// a map. Parsing stops at the first pair that cannot be read completely; an
// empty string gives an empty map.
SDBVMFieldRemapReals::RealMap
SDBVMFieldRemapReals::getRealMapping(const string& mappings)
{
    RealMap parsed;
    if (mappings.empty())
        return parsed;

    istrstream in(mappings.c_str());
    real key, target;
    while (in >> key >> target)
        parsed[key] = target;

    return parsed;
}
00687
00688 SDBVMFieldRemapReals::SDBVMFieldRemapReals(
SDBVMSource source,
00689
const string& mappings,
00690
real other_values_mapping,
00691
real missing_values_mapping,
00692 SDBVMOutputCoding oc,
00693 VMField::FieldType ft)
00694 :
inherited(source, 0, missing_values_mapping, oc, ft),
00695 real_mapping_(getRealMapping(mappings)),
00696 other_values_mapping_(other_values_mapping)
00697 {
00698
00699 setNumClasses(SDBVMOutputCoder::getNumClasses(
00700
real_mapping_, other_values_mapping, missing_values_mapping));
00701 }
00702
00703 SDBVMFieldRemapReals::SDBVMFieldRemapReals(
SDBVMSource source,
00704
const FieldStat& field_stat,
00705
real other_values_mapping,
00706
real missing_values_mapping,
00707 SDBVMOutputCoding oc,
00708 VMField::FieldType ft)
00709 :
inherited(source, 0, missing_values_mapping, oc, ft),
00710 other_values_mapping_(other_values_mapping)
00711 {
00712 map<string,int>::iterator it = field_stat.
symbolid.begin(),
00713 end = field_stat.
symbolid.end();
00714
for( ; it != end; ++it)
00715
real_mapping_[
real(
todouble(it->first))] = it->second;
00716
00717
00718 setNumClasses(SDBVMOutputCoder::getNumClasses(
00719
real_mapping_, other_values_mapping, missing_values_mapping));
00720 }
00721
00722 real SDBVMFieldRemapReals::getDiscreteValue(
const SDBWithStats& sdb,
00723
const Row& row)
const
00724
{
00725
FieldValue v = source_.
getValue(sdb,row);
00726
if(v.
isMissing())
00727
return missing_values_mapping_;
00728
00729
real realval =
real(v);
00730 RealMap::const_iterator found =
real_mapping_.find(realval);
00731
if (found !=
real_mapping_.end())
00732 realval = found->second;
00733
else
00734 realval =
other_values_mapping_;
00735
return realval;
00736 }
00737
00738
00739
00740
00741
// Parses a whitespace-separated list "key value key value ..." (string key,
// real value) into a map. Parsing stops at the first pair that cannot be
// read completely; an empty string gives an empty map.
SDBVMFieldRemapStrings::StringMap
SDBVMFieldRemapStrings::getStringMapping(const string& mappings)
{
    StringMap parsed;
    if (mappings.empty())
        return parsed;

    istrstream in(mappings.c_str());
    string key;
    real target;
    while (in >> key >> target)
        parsed[key] = target;

    return parsed;
}
00760
00761 SDBVMFieldRemapStrings::SDBVMFieldRemapStrings(
SDBVMSource source,
00762
const string& mappings,
00763
real other_values_mapping,
00764
real missing_values_mapping,
00765 SDBVMOutputCoding oc,
00766 VMField::FieldType ft)
00767 :
inherited(source, 0, missing_values_mapping, oc, ft),
00768 string_mapping_(getStringMapping(mappings)),
00769 other_values_mapping_(other_values_mapping)
00770 {
00771
00772 setNumClasses(SDBVMOutputCoder::getNumClasses(
00773
string_mapping_, other_values_mapping, missing_values_mapping));
00774 }
00775
00776 SDBVMFieldRemapStrings::SDBVMFieldRemapStrings(
SDBVMSource source,
00777
const FieldStat& field_stat,
00778
real other_values_mapping,
00779
real missing_values_mapping,
00780 SDBVMOutputCoding oc,
00781 VMField::FieldType ft)
00782 :
inherited(source, 0, missing_values_mapping, oc, ft),
00783 other_values_mapping_(other_values_mapping)
00784 {
00785 map<string,int>::iterator it = field_stat.
symbolid.begin(),
00786 end = field_stat.
symbolid.end();
00787
for( ; it != end; ++it)
00788
string_mapping_[it->first] = it->second;
00789
00790
00791 setNumClasses(SDBVMOutputCoder::getNumClasses(
00792
string_mapping_, other_values_mapping, missing_values_mapping));
00793 }
00794
00795 real SDBVMFieldRemapStrings::getDiscreteValue(
const SDBWithStats& sdb,
00796
const Row& row)
const
00797
{
00798
real realval;
00799
FieldValue v = source_.
getValue(sdb,row);
00800
if(v.
isMissing())
00801
00802
00803
00804
00805
00806
return missing_values_mapping_;
00807
string s =
space_to_underscore(
string(v));
00808 StringMap::const_iterator found =
string_mapping_.find(s);
00809
if (found !=
string_mapping_.end())
00810 realval = found->second;
00811
else
00812 realval =
other_values_mapping_;
00813
return realval;
00814 }
00815
00816
00817
00818
00819
void
00820 SDBVMFieldRemapIntervals::getIntervals(
const string& mappings,
00821
bool& all_int,
real& max_of_map,
00822
Vec& intervals_x,
Vec& intervals_y)
00823 {
00824 all_int =
true;
00825 max_of_map = -FLT_MAX;
00826 istrstream in(mappings.c_str());
00827
real xi, yi;
00828 intervals_x.
resize(10);
00829 intervals_y.
resize(11);
00830
00831
int i;
00832
for(i=0; ; ++i) {
00833 in >> yi;
00834
if(!in)
00835
PLERROR(
"In NGSDBVMFieldRemapIntervals::getIntervals: "
00836
"mappings should have an odd number of elements, found %d",
00837 2*i);
00838 intervals_y[i] = yi;
00839
if (all_int && yi >= 0 && yi ==
int(yi))
00840 max_of_map = std::max(max_of_map, yi);
00841
else
00842 all_int =
false;
00843
00844 in >> xi;
00845
if(!in)
00846
break;
00847
if (i>0 && intervals_x[i-1]>=xi)
00848
PLERROR(
"In NGSDBVMFieldRemapIntervals::getIntervals: "
00849
"mappings needs x_{i-1}<x_i, found x[%d]=%f, x[%d]=%f",
00850 i-1,intervals_x[i-1],i,xi);
00851 intervals_x[i] = xi;
00852
if (intervals_x.length()==i+1) {
00853 intervals_x.resize(2*i);
00854 intervals_y.
resize(2*i+1);
00855 }
00856 }
00857
00858
00859
00860
00861
00862
00863
00864 intervals_x.
resize(i);
00865 intervals_y.
resize(i+1);
00866 }
00867
00868 SDBVMFieldRemapIntervals::SDBVMFieldRemapIntervals(
SDBVMSource source,
00869
const string& mappings,
00870
real other_values_mapping,
00871
real missing_values_mapping,
00872 SDBVMOutputCoding oc,
00873 VMField::FieldType ft)
00874 :
inherited(source, 0, missing_values_mapping, oc, ft),
00875 other_values_mapping_(other_values_mapping)
00876 {
00877
real max_of_map;
00878
bool all_int;
00879
getIntervals(mappings, all_int, max_of_map,
intervals_x_,
intervals_y_);
00880
00881
00882 setNumClasses(SDBVMOutputCoder::handleOtherAndMissing(
00883 all_int,
int(max_of_map), other_values_mapping, missing_values_mapping));
00884 }
00885
00886 real SDBVMFieldRemapIntervals::getDiscreteValue(
const SDBWithStats& sdb,
00887
const Row& row)
const
00888
{
00889
FieldValue v = source_.
getValue(sdb,row);
00890
if(v.
isMissing())
00891
return missing_values_mapping_;
00892
else
00893
return intervals_y_[1+
int(
binary_search(
intervals_x_,
real(v)))];
00894 }
00895
00896
00897
00898
00899 SDBVMFieldMultiDiscrete::SDBVMFieldMultiDiscrete(
const FieldArray& fields,
00900
real missing_values_mapping,
00901 SDBVMOutputCoding oc,
00902 VMField::FieldType ft)
00903 :
inherited(
FieldPtr() , 0,
00904 missing_values_mapping, oc, ft)
00905 {
00906
setFields(fields);
00907 }
00908
00909 void SDBVMFieldMultiDiscrete::setFields(
const FieldArray& fields)
00910 {
00911
fields_ = fields;
00912
int n = fields.
size();
00913
field_multipliers_.
resize(n);
00914 setNumClasses(0);
00915
00916
00917
if (n>0) {
00918
int prod = 1;
00919
field_multipliers_[n-1] = 1;
00920
for (
int i=n-2; i>=0; --i) {
00921 prod *= fields[i+1]->getNumClasses();
00922
field_multipliers_[i] = prod;
00923 }
00924 prod *= fields[0]->getNumClasses();
00925 setNumClasses(prod);
00926 }
00927 }
00928
00929 real SDBVMFieldMultiDiscrete::getDiscreteValue(
const SDBWithStats& sdb,
00930
const Row& row)
const
00931
{
00932
int index = 0;
00933
int n =
fields_.
size();
00934
for (
int i=0; i<n; ++i) {
00935
real value =
fields_[i]->getDiscreteValue(sdb,row);
00936
if (value !=
int(value) || value < 0)
00937
PLERROR(
"SDBVMFieldMultiDiscrete::getDiscreteValue: negative or "
00938
"non-integer value %f returned for field %d", value, i);
00939 index +=
int(value)*int(
field_multipliers_[i]);
00940 }
00941
return real(index);
00942 }
00943
00944
00945
00946
00947
00948
00949
00950
00951
00952
00953 SDBVMFieldICBCTargets::SDBVMFieldICBCTargets(
Schema schema,
bool use_roadstar,
00954
bool add_claims_sum_column,
bool rescale_by_interval,
00955
bool rescale_by_start_date,
Mat& start_date_rescaling_values,
const string& targetname)
00956 :
inherited(0,
VMField::Continuous), use_roadstar_(use_roadstar),
00957 add_claims_sum_column_(add_claims_sum_column),
00958 rescale_by_interval_(rescale_by_interval),
00959 rescale_by_start_date_(rescale_by_start_date),
00960 start_date_rescaling_values_(start_date_rescaling_values),
00961 targetname_(targetname),
00962 start_date_(schema("policy_start_date")),
00963 end_date_(schema("policy_end_date")),
00964 bodily_injury_incurred_((targetname_=="ALL" || targetname_=="bodily_injury_incurred") ?
00965 schema("bodily_injury_incurred") :
FieldPtr()),
00966 property_damage_incurred_((targetname_=="ALL" || targetname_=="sum_all_but_BI"
00967 || targetname_=="property_damage_incurred") ?
00968 schema("property_damage_incurred") :
FieldPtr()),
00969 accident_death_incurred_((targetname_=="ALL" || targetname_=="sum_all_but_BI"
00970 || targetname_=="accident_death_incurred") ?
00971 schema("accident_death_incurred") :
FieldPtr()),
00972 collision_lou_incurred_((targetname_=="ALL" || targetname_=="sum_all_but_BI"
00973 || targetname_=="collision_lou_incurred") ?
00974 schema("collision_lou_incurred") :
FieldPtr()),
00975 comprehensive_incurred_((targetname_=="ALL" || targetname_=="sum_all_but_BI"
00976 || targetname_=="comprehensive_incurred") ?
00977 schema("comprehensive_incurred") :
FieldPtr()),
00978 bodily_injury_count_((targetname_=="ALLcounts" || targetname_=="bodily_injury_count") ?
00979 schema("bodily_injury_count") :
FieldPtr()),
00980 property_damage_count_((targetname_=="ALLcounts" || targetname_=="all_counts_but_BI"
00981 || targetname_=="property_damage_count") ?
00982 schema("property_damage_count") :
FieldPtr()),
00983 accident_death_count_((targetname_=="ALLcounts" || targetname_=="all_counts_but_BI"
00984 || targetname_=="accident_death_count") ?
00985 schema("accident_death_count") :
FieldPtr()),
00986 collision_lou_count_((targetname_=="ALLcounts" || targetname_=="all_counts_but_BI"
00987 || targetname_=="collision_lou_count") ?
00988 schema("collision_lou_count") :
FieldPtr()),
00989 comprehensive_count_((targetname_=="ALLcounts" || targetname_=="all_counts_but_BI"
00990 || targetname_=="comprehensive_count") ?
00991 schema("comprehensive_count") :
FieldPtr()),
00992 bodily_injury_severity_((targetname_=="ALLseverities" || targetname_=="bodily_injury_severity") ?
00993 schema("bodily_injury_severity") :
FieldPtr()),
00994 property_damage_severity_((targetname_=="ALLseverities" || targetname_=="all_severities_but_BI"
00995 || targetname_=="property_damage_severity") ?
00996 schema("property_damage_severity") :
FieldPtr()),
00997 accident_death_severity_((targetname_=="ALLseverities" || targetname_=="all_severities_but_BI"
00998 || targetname_=="accident_death_severity") ?
00999 schema("accident_death_severity") :
FieldPtr()),
01000 collision_lou_severity_((targetname_=="ALLseverities" || targetname_=="all_severities_but_BI"
01001 || targetname_=="collision_lou_severity") ?
01002 schema("collision_lou_severity") :
FieldPtr()),
01003 comprehensive_severity_((targetname_=="ALLseverities" || targetname_=="all_severities_but_BI"
01004 || targetname_=="comprehensive_severity") ?
01005 schema("comprehensive_severity") :
FieldPtr())
01006
01007
01008
01009 {
01010
reference_start_date_year_ =
rescale_by_start_date_ ? (
int)
start_date_rescaling_values_[0][0] : 0;
01011
reference_start_date_month_ =
rescale_by_start_date_ ? (
int)
start_date_rescaling_values_[0][1] : 0;
01012 }
01013
01014 void SDBVMFieldICBCTargets::convertField(
const SDBWithStats& sdb,
01015
const Row& row,
const Vec& output)
const
01016
{
01017
int i = 0;
01018
PDate start_date =
start_date_.
getValue(sdb,row).
toDate();
01019
PDate end_date =
end_date_.
getValue(sdb,row).
toDate();
01020
01021
01022
01023
if (start_date.
year<1970) start_date.
year = end_date.
year-1;
01024
01025
real normalization = 0.001;
01026
real duration = (end_date - start_date)/365.25;
01027
01028
if (
rescale_by_interval_) {
01029
if (
is_missing(duration) || duration<=0)
01030 {
01031 cout <<
"start_date = " << start_date <<
endl;
01032 cout <<
"end_date = " << end_date <<
endl;
01033
PLERROR(
"SDBVMFieldICBCTargets: incorrect dates");
01034 }
01035 normalization = 0.001/duration;
01036 }
01037
if (
rescale_by_start_date_) {
01038
int row_index = (start_date.
year -
reference_start_date_year_ - 1) * 12
01039 + 12 -
reference_start_date_month_ + start_date.
month;
01040
if (
targetname_==
"ALL" ||
targetname_==
"bodily_injury_incurred")
01041 {
01042
if (
start_date_rescaling_values_[row_index][2] == 0)
01043
PLERROR(
"Trying to divide by zero");
01044 output[i++] =
real(
bodily_injury_incurred_.
getValue(sdb,row))*normalization /
01045
start_date_rescaling_values_[row_index][2];
01046 }
01047
if (
targetname_==
"ALL" ||
targetname_==
"property_damage_incurred")
01048 {
01049
if (
start_date_rescaling_values_[row_index][3] == 0)
01050
PLERROR(
"Trying to divide by zero");
01051 output[i++] =
real(
property_damage_incurred_.
getValue(sdb,row))*normalization /
01052
start_date_rescaling_values_[row_index][3];
01053 }
01054
if (
targetname_==
"ALL" ||
targetname_==
"accident_death_incurred")
01055 {
01056
if (
start_date_rescaling_values_[row_index][4] == 0)
01057
PLERROR(
"Trying to divide by zero");
01058 output[i++] =
real(
accident_death_incurred_.
getValue(sdb,row))*normalization /
01059
start_date_rescaling_values_[row_index][4];
01060 }
01061
if (
targetname_==
"ALL" ||
targetname_==
"collision_lou_incurred")
01062 {
01063
if (
start_date_rescaling_values_[row_index][5] == 0)
01064
PLERROR(
"Trying to divide by zero");
01065 output[i++] =
real(
collision_lou_incurred_.
getValue(sdb,row))*normalization /
01066
start_date_rescaling_values_[row_index][5];
01067 }
01068
if (
targetname_==
"ALL" ||
targetname_==
"comprehensive_incurred")
01069 {
01070
if (
start_date_rescaling_values_[row_index][6] == 0)
01071
PLERROR(
"Trying to divide by zero");
01072 output[i++] =
real(
comprehensive_incurred_.
getValue(sdb,row))*normalization /
01073
start_date_rescaling_values_[row_index][6];
01074 }
01075
if (
targetname_==
"sum_all_but_BI")
01076 {
01077
if (
start_date_rescaling_values_[row_index][3] == 0 ||
01078
start_date_rescaling_values_[row_index][4] == 0 ||
01079
start_date_rescaling_values_[row_index][5] == 0 ||
01080
start_date_rescaling_values_[row_index][6] == 0)
01081
PLERROR(
"Trying to divide by zero");
01082 output[i++] = (
real(
property_damage_incurred_.
getValue(sdb,row))/
01083
start_date_rescaling_values_[row_index][3] +
01084 real(
accident_death_incurred_.
getValue(sdb,row))/
01085
start_date_rescaling_values_[row_index][4] +
01086 real(
collision_lou_incurred_.
getValue(sdb,row))/
01087
start_date_rescaling_values_[row_index][5] +
01088 real(
comprehensive_incurred_.
getValue(sdb,row))/
01089
start_date_rescaling_values_[row_index][6])*normalization;
01090 }
01091 }
01092
else {
01093
if (
targetname_==
"ALL" ||
targetname_==
"bodily_injury_incurred")
01094 output[i++] =
real(
bodily_injury_incurred_.
getValue(sdb,row))*normalization;
01095
if (
targetname_==
"ALL" ||
targetname_==
"property_damage_incurred")
01096 output[i++] = real(
property_damage_incurred_.
getValue(sdb,row))*normalization;
01097
if (
targetname_==
"ALL" ||
targetname_==
"accident_death_incurred")
01098 output[i++] = real(
accident_death_incurred_.
getValue(sdb,row))*normalization;
01099
if (
targetname_==
"ALL" ||
targetname_==
"collision_lou_incurred")
01100 output[i++] = real(
collision_lou_incurred_.
getValue(sdb,row))*normalization;
01101
if (
targetname_==
"ALL" ||
targetname_==
"comprehensive_incurred")
01102 output[i++] = real(
comprehensive_incurred_.
getValue(sdb,row))*normalization;
01103
if (
targetname_==
"sum_all_but_BI")
01104 output[i++] = (real(
property_damage_incurred_.
getValue(sdb,row))+
01105 real(
accident_death_incurred_.
getValue(sdb,row))+
01106 real(
collision_lou_incurred_.
getValue(sdb,row))+
01107 real(
comprehensive_incurred_.
getValue(sdb,row)))*normalization;
01108
01109
01110
if (
targetname_==
"ALLcounts" ||
targetname_==
"bodily_injury_count")
01111 output[i++] = real(
bodily_injury_count_.
getValue(sdb,row));
01112
if (
targetname_==
"ALLcounts" ||
targetname_==
"all_counts_but_BI" ||
targetname_==
"property_damage_count")
01113 output[i++] = real(
property_damage_count_.
getValue(sdb,row));
01114
if (
targetname_==
"ALLcounts" ||
targetname_==
"all_counts_but_BI" ||
targetname_==
"accident_death_count")
01115 output[i++] = real(
accident_death_count_.
getValue(sdb,row));
01116
if (
targetname_==
"ALLcounts" ||
targetname_==
"all_counts_but_BI" ||
targetname_==
"collision_lou_count")
01117 output[i++] = real(
collision_lou_count_.
getValue(sdb,row));
01118
if (
targetname_==
"ALLcounts" ||
targetname_==
"all_counts_but_BI" ||
targetname_==
"comprehensive_count")
01119 output[i++] = real(
comprehensive_count_.
getValue(sdb,row));
01120
01121
01122
01123
if (
targetname_==
"ALLseverities" ||
targetname_==
"bodily_injury_severity")
01124 {
01125
int n =
int(real(
bodily_injury_count_.
getValue(sdb,row)));
01126 output[i++] = n>0? real(
bodily_injury_incurred_.
getValue(sdb,row)) / n: 0;
01127 }
01128
if (
targetname_==
"ALLseverities" ||
targetname_==
"all_severities_but_BI" ||
targetname_==
"property_damage_severity")
01129 {
01130
int n =
int(real(
property_damage_count_.
getValue(sdb,row)));
01131 output[i++] = n>0? real(
property_damage_incurred_.
getValue(sdb,row)) / n: 0;
01132 }
01133
if (
targetname_==
"ALLseverities" ||
targetname_==
"all_severities_but_BI" ||
targetname_==
"accident_death_severity")
01134 {
01135
int n =
int(real(
accident_death_count_.
getValue(sdb,row)));
01136 output[i++] = n>0? real(
accident_death_incurred_.
getValue(sdb,row)) / n: 0;
01137 }
01138
if (
targetname_==
"ALLseverities" ||
targetname_==
"all_severities_but_BI" ||
targetname_==
"collision_lou_severity")
01139 {
01140
int n =
int(real(
collision_lou_count_.
getValue(sdb,row)));
01141 output[i++] = n>0? real(
collision_lou_incurred_.
getValue(sdb,row)) / n: 0;
01142 }
01143
if (
targetname_==
"ALLseverities" ||
targetname_==
"all_severities_but_BI" ||
targetname_==
"comprehensive_severity")
01144 {
01145
int n =
int(real(
comprehensive_count_.
getValue(sdb,row)));
01146 output[i++] = n>0? real(
comprehensive_incurred_.
getValue(sdb,row)) / n: 0;
01147 }
01148
01149
01150
if (
targetname_==
"ALLseverities")
01151 output[i++] = real(
bodily_injury_count_.
getValue(sdb,row))>0? 1 : 0;
01152
if (
targetname_==
"ALLseverities" ||
targetname_==
"all_severities_but_BI")
01153 {
01154 output[i++] = real(
property_damage_count_.
getValue(sdb,row))>0? 1 : 0;
01155 output[i++] = real(
accident_death_count_.
getValue(sdb,row))>0? 1 : 0;
01156 output[i++] = real(
collision_lou_count_.
getValue(sdb,row))>0? 1 : 0;
01157 output[i++] = real(
comprehensive_count_.
getValue(sdb,row))>0? 1 : 0;
01158 }
01159
01160 }
01161
if (
add_claims_sum_column_) {
01162
if (
targetname_==
"ALL")
01163 output[i] = output[0] + output[1] + output[2] + output[3] + output[4];
01164
else
01165 output[i] = output[0];
01166 i++;
01167 }
01168
if (
rescale_by_interval_)
01169 output[i] = duration;
01170
01171 convertMissing(output);
01172 }
01173
01174
01175
01176
01177 void SDBVMFieldHasClaim::convertField(
const SDBWithStats& sdb,
01178
const Row& row,
Vec& output)
const
01179
{
01180
real a,b,c,d,e,f;
01181 a =
real(row.
bind(
bodily_injury_incurred_).toDouble());
01182 b = real(row.
bind(
property_damage_incurred_).toDouble());
01183 c = real(row.
bind(
accident_death_incurred_).toDouble());
01184 d = real(row.
bind(
collision_lou_incurred_).toDouble());
01185 e = real(row.
bind(
comprehensive_incurred_).toDouble());
01186 f = real(row.
bind(
roadstar_incurred_).toDouble());
01187 output[0] = (a!=0) || (b!=0) || (c!=0) || (d!=0) || (e!=0) || (f!=0);
01188
01189 convertMissing(output);
01190 }
01191
01192
01193
01194 void SDBVMFieldSumClaims::convertField(
const SDBWithStats& sdb,
01195
const Row& row,
Vec& output)
const
01196
{
01197
real a,b,c,d,e,f;
01198 a =
real(row.
bind(
bodily_injury_incurred_).toDouble());
01199 b = real(row.
bind(
property_damage_incurred_).toDouble());
01200 c = real(row.
bind(
accident_death_incurred_).toDouble());
01201 d = real(row.
bind(
collision_lou_incurred_).toDouble());
01202 e = real(row.
bind(
comprehensive_incurred_).toDouble());
01203 f = real(row.
bind(
roadstar_incurred_).toDouble());
01204 output[0] = a+b+c+d+e+f;
01205
01206 convertMissing(output);
01207 }
01208
01209
01210 int ICBCpartition(
const Vec& claims,
real threshold)
01211 {
01212
bool flag_big = 0;
01213
bool flag_pos = 0;
01214
bool flag_neg = 0;
01215
01216
for (
int j=0; j<claims.
length(); j++)
01217 {
01218
if (claims[j]>threshold) {flag_big=1;}
01219
else if (claims[j]>0) {flag_pos=1;}
01220
else if (claims[j]<0) {flag_neg=1;}
01221 }
01222
01223
if (flag_big)
return 3;
01224
else if (flag_pos)
return 2;
01225
else if (flag_neg)
return 0;
01226
else return 1;
01227 }
01228
01229
01230
01231
01232 SDBVMFieldICBCClassification::SDBVMFieldICBCClassification(
Schema schema,
const string& fieldname,
const string& tmap_file)
01233 :
inherited(0),
01234 bodily_injury_incurred_(schema("bodily_injury_incurred")),
01235 property_damage_incurred_(schema("property_damage_incurred")),
01236 accident_death_incurred_(schema("accident_death_incurred")),
01237 collision_lou_incurred_(schema("collision_lou_incurred")),
01238 comprehensive_incurred_(schema("comprehensive_incurred")),
01239 policy_start_date_(schema("policy_start_date")),
01240 fieldname_(fieldname)
01241 {
01242
01243
01244
if (fieldname ==
"")
01245
threshold = 10000;
01246
else if (fieldname ==
"condprob3")
01247
threshold = 10000;
01248
else if (fieldname ==
"bodily_injury_incurred")
01249
threshold = 50000;
01250
else if (fieldname ==
"property_damage_incurred")
01251
threshold = 4000;
01252
else if (fieldname ==
"accident_death_incurred")
01253
threshold = 12000;
01254
else if (fieldname ==
"collision_lou_incurred")
01255
threshold = 5000;
01256
else if (fieldname ==
"comprehensive_incurred")
01257
threshold = 1000;
01258
else if (fieldname ==
"sum_all_but_BI")
01259
threshold = 10000;
01260
else
01261
PLERROR(
"Bad field fieldname");
01262 }
01263
01264 void SDBVMFieldICBCClassification::convertField(
const SDBWithStats& sdb,
01265
const Row& row,
const Vec& output)
const
01266
{
01267
int threshold = 10000;
01268
Vec claims(1);
01269
if (
fieldname_ ==
"")
01270 {
01271 claims.
resize(5);
01272 claims[0] =
real(
bodily_injury_incurred_.
getValue(sdb,row));
01273 claims[1] = real(
property_damage_incurred_.
getValue(sdb,row));
01274 claims[2] = real(
accident_death_incurred_.
getValue(sdb,row));
01275 claims[3] = real(
collision_lou_incurred_.
getValue(sdb,row));
01276 claims[4] = real(
comprehensive_incurred_.
getValue(sdb,row));
01277 }
01278
01279
else if (
fieldname_ ==
"condprob3")
01280 {
01281 claims[0] =
real(
property_damage_incurred_.
getValue(sdb,row))
01282 + real(
accident_death_incurred_.
getValue(sdb,row))
01283 + real(
collision_lou_incurred_.
getValue(sdb,row))
01284 + real(
comprehensive_incurred_.
getValue(sdb,row));
01285 output[0]=claims[0]<=0?0:1;
01286
01287
return;
01288 }
01289
01290
else if (
fieldname_ ==
"bodily_injury_incurred")
01291 claims[0] =
real(
bodily_injury_incurred_.
getValue(sdb,row));
01292
else if (
fieldname_ ==
"property_damage_incurred")
01293 claims[0] = real(
property_damage_incurred_.
getValue(sdb,row));
01294
else if (
fieldname_ ==
"accident_death_incurred")
01295 claims[0] = real(
accident_death_incurred_.
getValue(sdb,row));
01296
else if (
fieldname_ ==
"collision_lou_incurred")
01297 claims[0] = real(
collision_lou_incurred_.
getValue(sdb,row));
01298
else if (
fieldname_ ==
"comprehensive_incurred")
01299 claims[0] = real(
comprehensive_incurred_.
getValue(sdb,row));
01300
else if (
fieldname_ ==
"sum_all_but_BI")
01301 claims[0] = real(
property_damage_incurred_.
getValue(sdb,row)) +
01302 real(
accident_death_incurred_.
getValue(sdb,row)) +
01303 real(
collision_lou_incurred_.
getValue(sdb,row)) +
01304 real(
comprehensive_incurred_.
getValue(sdb,row));
01305 output[0] =
ICBCpartition(claims, threshold);
01306
01307 }
01308
01309
01310
01311
01312
01313
01314
01315
01316
01317
01318
01319
01320
01321 }