00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00044
#include "DatedJoinVMatrix.h"
00045
#include <plearn/base/PDate.h>
00046
00047
namespace PLearn {
00048
using namespace std;
00049
00050
00051 DatedJoinVMatrix::DatedJoinVMatrix()
00052 :
inherited(),master_date_field_index(-1),slave_date_interval_start_field_index(-1),
00053 slave_date_interval_end_field_index(-1), verbosity(0), output_the_slave(false), output_matching_index(false)
00054 {
00055 }
00056
00057
// Registers DatedJoinVMatrix with the PLearn object system, together with the
// one-line and multi-line help texts shown to users.
PLEARN_IMPLEMENT_OBJECT(
    DatedJoinVMatrix,
    "Join two vmatrices, taking into account a date field.",
    "The two vmatrices play an asymmetric role. They are called\n"
    "master and slave. The resulting vmatrix has one row for each row\n"
    "of the master vmatrix (or optionally of the slave vmatrix). Its\n"
    "columns are a concatenation of selected columns of the master vmatrix\n"
    "and of selected columns of the slave which 'match' according to a rule\n"
    "(always in the order: master fields, slave fields). Matching is\n"
    "obtained using shared 'key fields'. Optionally, for matching, a date field\n"
    "in the master is forced to belong to a date interval in the slave,\n"
    "as follows: slave_date_start < master_date <= slave_date_end.\n"
    "If no match is found then the master (or slave) columns are left with missing values.\n"
    "If more than one slave row matches, then the one with the latest\n"
    "slave_date_start is used (and a warning is optionally issued). If\n"
    "no slave_date_start field is provided then no date constraint is\n"
    "enforced, and the last key-matching slave row is matched to a master row.\n"
    "An option (output_the_slave) allows to output one row for each slave row\n"
    "instead of the default which outputs one row for each master row.\n"
    "Note that if (output_the_slave) then the non-matching master rows are 'lost'\n"
    "whereas if (!output_the_slave) then the non-matching slave rows are 'lost'.\n"
    "If output_the_slave and more than one master row matches with a given slave_row\n"
    "then the SUM of the master fields is computed (i.e. be careful that their sum is meaningful)\n"
    );
00080
00081 void DatedJoinVMatrix::getNewRow(
int i,
const Vec& v)
const
00082
{
00083
if (!
master || !
slave ||
slave_key_indices.
length()==0)
00084
PLERROR(
"DatedJoinVMatrix: object was not build properly!");
00085 list<int> master_index;
00086
int slave_index=-1;
00087
if (
output_the_slave)
00088 {
00089 slave_index = i;
00090 master_index =
slave2master[i];
00091
if (
output_matching_index)
00092 v[0] = *(master_index.begin());
00093 }
00094
else
00095 {
00096 master_index.
push_back(i);
00097 slave_index =
master2slave[i];
00098
if (
output_matching_index)
00099 v[0] = slave_index;
00100 }
00101
00102
Vec master_part = v.
subVec(
output_matching_index,
n_master_fields);
00103
Vec slave_part = v.
subVec(
n_master_fields+
output_matching_index,
n_slave_fields);
00104
00105
if (master_index.size()>0)
00106 {
00107 list<int>::const_iterator b_it = master_index.begin();
00108 list<int>::const_iterator e_it = master_index.end();
00109 master_part.
clear();
00110
for (list<int>::const_iterator it=b_it;it!=e_it;++it)
00111 {
00112
00113
master->getRow(*it,
master_row);
00114
if (
master_field_indices.
size()>0)
00115
for (
int j=0;j<
master_field_indices.
size();j++)
00116 master_part[j] +=
master_row[
master_field_indices[j]];
00117
else
00118 master_part += master_row;
00119 }
00120 }
00121
else
00122 master_part.
fill(
MISSING_VALUE);
00123
00124
if (slave_index>=0)
00125 {
00126
00127
slave->getRow(slave_index,
slave_row);
00128
if (
slave_field_indices.
size()>0)
00129
for (
int j=0;j<
slave_field_indices.
size();j++)
00130 slave_part[j] =
slave_row[
slave_field_indices[j]];
00131
else
00132 slave_part << slave_row;
00133 }
00134
else
00135 slave_part.
fill(
MISSING_VALUE);
00136
00137 }
00138
00139 void DatedJoinVMatrix::declareOptions(
OptionList& ol)
00140 {
00141
declareOption(ol,
"master", &DatedJoinVMatrix::master, OptionBase::buildoption,
00142
"Master vmatrix, whose columns are directly copied in the result.");
00143
00144
declareOption(ol,
"slave", &DatedJoinVMatrix::slave, OptionBase::buildoption,
00145
"Slave vmatrix, of which only some columns are copied, when the\n"
00146
"key fields and the dates match.");
00147
00148
declareOption(ol,
"master_key_indices", &DatedJoinVMatrix::master_key_indices,
00149 OptionBase::buildoption,
00150
"Indices of the 'key' fields in the master vmatrix. It is not necessary\n"
00151
"to specify them if the master_key_names are given or if the slave_key_names\n"
00152
"are specified (in that case they are assumed to be the same)\n"
00153 );
00154
00155
declareOption(ol,
"master_key_names", &DatedJoinVMatrix::master_key_names,
00156 OptionBase::buildoption,
00157
"Names of the 'key' fields in the master vmatrix. They should not be\n"
00158
"specified if the master_key_indices are given directly. If not provided\n"
00159
"and if the slave_key_names are specified they are assumed to be the same.\n"
00160 );
00161
00162
declareOption(ol,
"slave_key_indices", &DatedJoinVMatrix::slave_key_indices,
00163 OptionBase::buildoption,
00164
"Indices of the 'key' fields in the slave vmatrix. It is not necessary\n"
00165
"to specify them if the slave_key_names are given or if the master_key_names\n"
00166
"are specified (in that case they are assumed to be the same)\n"
00167 );
00168
00169
declareOption(ol,
"slave_key_names", &DatedJoinVMatrix::slave_key_names,
00170 OptionBase::buildoption,
00171
"Names of the 'key' fields in the slave vmatrix. They should not be\n"
00172
"specified if the slave_key_indices are given directly. If not provided\n"
00173
"and if the master_key_names are specified they are assumed to be the same.\n"
00174 );
00175
00176
declareOption(ol,
"slave_field_indices", &DatedJoinVMatrix::slave_field_indices,
00177 OptionBase::buildoption,
00178
"Indices of the fields in the slave vmatrix to be copied in result. It is not necessary\n"
00179
"to specify them if the slave_field_names are given.\n"
00180
"N.B. IF NEITHER slave_field_indices NOR slave_field_names are given then it is assumed\n"
00181
"ALL slave fields should be copied on output.\n"
00182 );
00183
00184
declareOption(ol,
"slave_field_names", &DatedJoinVMatrix::slave_field_names,
00185 OptionBase::buildoption,
00186
"Names of the fields in the slave vmatrix to be copied in result. It is not necessary\n"
00187
"to specify them if the slave_field_indices are given.\n"
00188
"N.B. IF NEITHER slave_field_indices NOR slave_field_names are given then it is assumed\n"
00189
"ALL slave fields should be copied on output.\n"
00190 );
00191
00192
declareOption(ol,
"master_field_indices", &DatedJoinVMatrix::master_field_indices,
00193 OptionBase::buildoption,
00194
"Indices of the fields in the master vmatrix to be copied in result. It is not necessary\n"
00195
"to specify them if the slave_field_names are given.\n"
00196
"N.B. IF NEITHER master_field_indices NOR master_field_names are given then it is assumed\n"
00197
"ALL master fields should be copied on output.\n"
00198 );
00199
00200
declareOption(ol,
"master_field_names", &DatedJoinVMatrix::master_field_names,
00201 OptionBase::buildoption,
00202
"Names of the fields in the slave vmatrix to be copied in result. It is not necessary\n"
00203
"to specify them if the slave_field_indices are given.\n"
00204
"N.B. IF NEITHER master_field_indices NOR master_field_names are given then it is assumed\n"
00205
"ALL master fields should be copied on output.\n"
00206 );
00207
00208
declareOption(ol,
"master_date_field_index", &DatedJoinVMatrix::master_date_field_index,
00209 OptionBase::buildoption,
00210
"Index of the date field in the master vmatrix. Should not be specified\n"
00211
"if the master_date_field_name is given.\n"
00212 );
00213
00214
declareOption(ol,
"master_date_field_name", &DatedJoinVMatrix::master_date_field_name,
00215 OptionBase::buildoption,
00216
"Name of the date field in the master vmatrix. Should not be specified\n"
00217
"if the master_date_field_index is given.\n"
00218 );
00219
00220
declareOption(ol,
"slave_date_interval_start_field_index",
00221 &DatedJoinVMatrix::slave_date_interval_start_field_index,
00222 OptionBase::buildoption,
00223
"Index of the date interval start field in the slave vmatrix.\n"
00224
"Should not be specified if the slave_date_interval_start_field_name is given.\n"
00225 );
00226
00227
declareOption(ol,
"slave_date_interval_start_field_name",
00228 &DatedJoinVMatrix::slave_date_interval_start_field_name,
00229 OptionBase::buildoption,
00230
"Name of the date interval start field in the slave vmatrix.\n"
00231
"Should not be specified if the slave_date_interval_start_field_index is given.\n"
00232 );
00233
00234
declareOption(ol,
"slave_date_interval_end_field_index",
00235 &DatedJoinVMatrix::slave_date_interval_end_field_index,
00236 OptionBase::buildoption,
00237
"Index of the date interval end field in the slave vmatrix.\n"
00238
"Should not be specified if the slave_date_interval_end_field_name is given.\n"
00239 );
00240
00241
declareOption(ol,
"slave_date_interval_end_field_name",
00242 &DatedJoinVMatrix::slave_date_interval_end_field_name,
00243 OptionBase::buildoption,
00244
"Name of the date interval end field in the slave vmatrix.\n"
00245
"Should not be specified if the slave_date_interval_end_field_index is given.\n"
00246 );
00247
00248
declareOption(ol,
"verbosity", &DatedJoinVMatrix::verbosity,
00249 OptionBase::buildoption,
00250
"0: no warning issued,\n"
00251
"1: warning issued if more than one slave row matches,\n"
00252
"2: details about these matches are printed\n"
00253 );
00254
00255
declareOption(ol,
"output_the_slave", &DatedJoinVMatrix::output_the_slave,
00256 OptionBase::buildoption,
00257
"If true than output the SLAVE rows (with master_fields_* from matching master row)\n"
00258
"instead of the MASTER rows (with slave_fields_* from the matching slave row)\n"
00259 );
00260
00261
declareOption(ol,
"output_matching_index", &DatedJoinVMatrix::output_matching_index,
00262 OptionBase::buildoption,
00263
"If true than output an extra variable 'matching_index' which contains the row\n"
00264
"index of the matching slave row (if !output_the_slave) or matching master row\n"
00265
"if (output_the_slave).\n"
00266 );
00267
00268
00269 inherited::declareOptions(ol);
00270 }
00271
00272 void DatedJoinVMatrix::build_()
00273 {
00274
if (
master &&
slave)
00275 {
00276
00277
00278
if (
master_key_names.
length()>0)
00279 {
00280
master_key_indices.
resize(
master_key_names.
length());
00281
for (
int i=0;i<
master_key_names.
length();i++)
00282
master_key_indices[i] =
master->
getFieldIndex(
master_key_names[i]);
00283 }
00284
else if (
master_key_indices.
length()==0)
00285 {
00286
if (
slave_key_names.
length()>0)
00287 {
00288
master_key_indices.
resize(
slave_key_names.
length());
00289
for (
int i=0;i<
slave_key_names.
length();i++)
00290
master_key_indices[i] =
master->
getFieldIndex(
slave_key_names[i]);
00291 }
00292
else PLERROR(
"DatedJoinVMatrix: No key names were provided and no master_key_indices were provided!");
00293 }
00294
00295
if (
slave_key_names.
length()>0)
00296 {
00297
slave_key_indices.
resize(
slave_key_names.
length());
00298
for (
int i=0;i<
slave_key_names.
length();i++)
00299
slave_key_indices[i] =
slave->
getFieldIndex(
slave_key_names[i]);
00300 }
00301
else if (
slave_key_indices.
length()==0)
00302 {
00303
if (
master_key_names.
length()>0)
00304 {
00305
slave_key_indices.
resize(
master_key_names.
length());
00306
for (
int i=0;i<
master_key_names.
length();i++)
00307
slave_key_indices[i] =
slave->
getFieldIndex(
master_key_names[i]);
00308 }
00309
else PLERROR(
"DatedJoinVMatrix: No key names were provided and no slave_key_indices were provided!");
00310 }
00311
00312
if (
slave_field_names.
length()>0)
00313 {
00314
slave_field_indices.
resize(
slave_field_names.
length());
00315
for (
int i=0;i<
slave_field_names.
length();i++)
00316
slave_field_indices[i] =
slave->
getFieldIndex(
slave_field_names[i]);
00317 }
00318
00319
if (
master_field_names.
length()>0)
00320 {
00321
master_field_indices.
resize(
master_field_names.
length());
00322
for (
int i=0;i<
master_field_names.
length();i++)
00323
master_field_indices[i] =
master->
getFieldIndex(
master_field_names[i]);
00324 }
00325
00326
if (
master_date_field_name!=
"")
00327
master_date_field_index =
master->
getFieldIndex(
master_date_field_name);
00328
else if (
master_date_field_index<0)
00329
PLWARNING(
"DatedJoinVMatrix: No master_date_field_name was provided and no master_date_field_index was provided!");
00330
00331
if (
slave_date_interval_start_field_name!=
"")
00332
slave_date_interval_start_field_index =
slave->
getFieldIndex(
slave_date_interval_start_field_name);
00333
else if (slave_date_interval_start_field_index<0 && master_date_field_index>=0)
00334
PLERROR(
"DatedJoinVMatrix: No slave_date_interval_start_field_name was provided and no slave_date_interval_start_field_index was provided!");
00335
00336
if (
slave_date_interval_end_field_name!=
"")
00337
slave_date_interval_end_field_index =
slave->
getFieldIndex(
slave_date_interval_end_field_name);
00338
else if (slave_date_interval_end_field_index<0 && master_date_field_index>=0)
00339
PLERROR(
"DatedJoinVMatrix: No slave_date_interval_end_field_name was provided and no slave_date_interval_end_field_index was provided!");
00340
00341
00342
ProgressBar* pb=
new ProgressBar(
"DatedJoinVMatrix: indexing the slave.",
slave.
length());
00343
key.
resize(
slave_key_indices.
length());
00344
slave_row.
resize(
slave.
width());
00345
master_row.
resize(
master.
width());
00346
for (
int i=0;i<
slave.
length();i++)
00347 {
00348
slave->getRow(i,
slave_row);
00349
for (
int j=0;j<
slave_key_indices.
size();j++)
00350
key[j] =
slave_row[
slave_key_indices[j]];
00351
mp.insert(make_pair(
key,i));
00352 pb->
update(i);
00353 }
00354
delete pb;
00355
00356
00357
if (
master_field_indices.
size()>0)
00358
n_master_fields =
master_field_indices.
size();
00359
else
00360
n_master_fields =
master->
width();
00361
if (
slave_field_indices.
size()>0)
00362
n_slave_fields =
slave_field_indices.
size();
00363
else
00364
n_slave_fields =
slave->
width();
00365 width_ =
output_matching_index +
n_master_fields +
n_slave_fields;
00366
if (
output_the_slave)
00367 length_ =
slave.
length();
00368
else
00369 length_ =
master.
length();
00370
00372 fieldinfos.
resize(width_);
00373
Array<VMField> master_infos =
master->getFieldInfos();
00374
Array<VMField> slave_infos =
slave->getFieldInfos();
00375
if (
output_matching_index)
00376 fieldinfos[0].name=
"matching_index";
00377
if (master_infos.
size() > 0)
00378 {
00379
if (
master_field_indices.
size()>0)
00380
for (
int i=0; i<
n_master_fields; ++i)
00381 fieldinfos[
output_matching_index+i] = master_infos[
master_field_indices[i]];
00382
else
00383
for (
int i=0; i<n_master_fields; ++i)
00384 fieldinfos[
output_matching_index+i] = master_infos[i];
00385 }
00386
if (slave_infos.
size() > 0)
00387 {
00388
if (
slave_field_indices.
size()>0)
00389
for (
int i=0; i<
slave_field_indices.
size(); ++i)
00390 {
00391
VMField f=slave_infos[
slave_field_indices[i]];
00392
if ((
master_field_indices.
size()==0 &&
master->fieldIndex(f.
name)>=0)
00393 ||
master_field_names.
contains(f.
name))
00394 f.
name =
"slave." + f.
name;
00395 fieldinfos[
output_matching_index+
n_master_fields+i] = f;
00396 }
00397
else
00398
for (
int i=0; i<n_slave_fields; ++i)
00399 {
00400
VMField f=slave_infos[i];
00401
if ((
master_field_indices.
size()==0 &&
master->fieldIndex(f.
name)>=0)
00402 ||
master_field_names.
contains(f.
name))
00403 f.
name =
"slave." + f.
name;
00404 fieldinfos[
output_matching_index+
n_master_fields+i] = f;
00405 }
00406 }
00407 pb=
new ProgressBar(
"DatedJoinVMatrix: matching the master and slave.",
master->
length());
00408
00409
master2slave.
resize(
master->
length());
00410
master2slave.
fill(-1);
00411
slave2master.
resize(
slave->
length());
00412
for (
int i=0;i<
master->
length();i++)
00413 {
00414
master->getRow(i,
master_row);
00415
00416
for (
int j=0;j<
master_key_indices.
size();j++)
00417
key[j] =
master_row[
master_key_indices[j]];
00418
00419
00420 Maptype::const_iterator it,low,upp;
00421 pair<Maptype::const_iterator,Maptype::const_iterator> matches=
mp.equal_range(
key);
00422 low=matches.first;
00423 upp=matches.second;
00424
if (low!=
mp.end())
00425 {
00426
PDate master_date;
00427
if (
master_date_field_index>=0)
00428 master_date =
float_to_date(master_row[
master_date_field_index]);
00429
PDate latest_match;
00430
int n_matches=0;
00431
static TVec<int> matches;
00432
if (
verbosity>1) matches.
resize(0);
00433
int matching_slave_row_index = -1;
00434
00435
for (it=low;it!=upp;++it)
00436 {
00437
slave->getRow(it->second,
slave_row);
00438
if (master_date_field_index>=0)
00439 {
00440
PDate slave_date_interval_start =
float_to_date(
slave_row[
slave_date_interval_start_field_index]);
00441
PDate slave_date_interval_end =
float_to_date(
slave_row[
slave_date_interval_end_field_index]);
00442
if (master_date>slave_date_interval_start && master_date<=slave_date_interval_end)
00443 {
00444
if (n_matches==0 || slave_date_interval_start > latest_match)
00445 {
00446 latest_match = slave_date_interval_start;
00447 matching_slave_row_index = it->second;
00448 }
00449 n_matches++;
00450
if (
verbosity>1) matches.
append(it->second);
00451 }
00452 }
else
00453 {
00454 matching_slave_row_index = it->second;
00455 n_matches++;
00456
if (
verbosity>1) matches.
append(it->second);
00457 }
00458 }
00459
if (matching_slave_row_index>=0)
00460 {
00461
master2slave[i] = matching_slave_row_index;
00462
slave2master[matching_slave_row_index].
push_back(i);
00463 }
00464
if (n_matches>1 &&
verbosity>0)
00465 {
00466
PLWARNING(
"DatedJointVMatrix:getRow(%d,.) matched more than once\n",i);
00467
if (
verbosity >1)
00468
for (
int j=0;j<n_matches;j++)
00469 cerr <<
"master row " << i <<
" matched slave row " << matches[j] <<
endl;
00470 }
00471 }
00472 pb->
update(i);
00473 }
00474
delete pb;
00475 }
00476 }
00477
00478
00479 void DatedJoinVMatrix::build()
00480 {
00481 inherited::build();
00482
build_();
00483 }
00484
00485 void DatedJoinVMatrix::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies)
00486 {
00487 inherited::makeDeepCopyFromShallowCopy(copies);
00488
00489
deepCopyField(
slave_row, copies);
00490
deepCopyField(
key, copies);
00491
deepCopyField(
master2slave, copies);
00492
deepCopyField(
slave2master, copies);
00493
deepCopyField(
master, copies);
00494
deepCopyField(
slave, copies);
00495
deepCopyField(
master_key_indices, copies);
00496
deepCopyField(
slave_key_indices, copies);
00497
deepCopyField(
master_key_names, copies);
00498
deepCopyField(
slave_key_names, copies);
00499
deepCopyField(
slave_field_indices, copies);
00500
deepCopyField(
slave_field_names, copies);
00501 }
00502
00503 }
00504