Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

DatedJoinVMatrix.cc

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // DatedJoinVMatrix.cc 00004 // 00005 // Copyright (C) 2004 *Yoshua Bengio* 00006 // 00007 // Redistribution and use in source and binary forms, with or without 00008 // modification, are permitted provided that the following conditions are met: 00009 // 00010 // 1. Redistributions of source code must retain the above copyright 00011 // notice, this list of conditions and the following disclaimer. 00012 // 00013 // 2. Redistributions in binary form must reproduce the above copyright 00014 // notice, this list of conditions and the following disclaimer in the 00015 // documentation and/or other materials provided with the distribution. 00016 // 00017 // 3. The name of the authors may not be used to endorse or promote 00018 // products derived from this software without specific prior written 00019 // permission. 00020 // 00021 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00022 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00023 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00024 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00025 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00026 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00027 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00028 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00029 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00030 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00031 // 00032 // This file is part of the PLearn library. For more information on the PLearn 00033 // library, go to the PLearn Web site at www.plearn.org 00034 00035 /* ******************************************************* 00036 * $Id: DatedJoinVMatrix.cc,v 1.11 2004/07/21 16:30:55 chrish42 Exp $ 00037 ******************************************************* */ 00038 00039 // Authors: *Yoshua Bengio* 00040 00044 #include "DatedJoinVMatrix.h" 00045 #include <plearn/base/PDate.h> 00046 00047 namespace PLearn { 00048 using namespace std; 00049 00050 00051 DatedJoinVMatrix::DatedJoinVMatrix() 00052 :inherited(),master_date_field_index(-1),slave_date_interval_start_field_index(-1), 00053 slave_date_interval_end_field_index(-1), verbosity(0), output_the_slave(false), output_matching_index(false) 00054 { 00055 } 00056 00057 PLEARN_IMPLEMENT_OBJECT(DatedJoinVMatrix, 00058 "Join two vmatrices, taking into account a date field.", 00059 "The two vmatrices play an asymmetric role. They are called\n" 00060 "master and slave. The resulting vmatrix has one row for each row\n" 00061 "of the master vmatrix (or optionally of the slave vmatrix). Its\n" 00062 "columns are a concatenation of selected columns of the master vmatrix\n" 00063 "and of selected columns of the slave which 'match' according to a rule\n" 00064 "(always in the order: master fields, slave fields). Matchint is\n" 00065 "obtained using shared 'key fields'. Optionally, for matching, a date field\n" 00066 "in the master is forced to belong to a date interval in the slave,\n" 00067 "as follows: slave_date_start < master_date <= slave_date_end.\n" 00068 "If no match is found then the master (or slave) columns are left with missing values.\n" 00069 "If more than one slave row matches, then the one with the latest\n" 00070 "slave_date_start is used (and a warning is optionally issued). If\n" 00071 "no slave_date_start field is provided then no date constraint is\n" 00072 "enforced, and the last key-matching slave row is matched to a master row.\n" 00073 "An option (output_the_slave) allows to output one row for each slave row\n" 00074 "instead of the default which outputs one row for each master row.\n" 00075 "Note that if (output_the_slave) then the non-matching master rows are 'lost'\n" 00076 "whereas if (!output_the_slave) then the non-matching slave rows are 'lost'.\n" 00077 "If output_the_slave and more than one master row matches with a given slave_row\n" 00078 "then the SUM of the master fields is computed (i.e. be careful that their sum is meaningful)\n" 00079 ); 00080 00081 void DatedJoinVMatrix::getNewRow(int i, const Vec& v) const 00082 { 00083 if (!master || !slave || slave_key_indices.length()==0) // etc... 00084 PLERROR("DatedJoinVMatrix: object was not build properly!"); 00085 list<int> master_index; 00086 int slave_index=-1; 00087 if (output_the_slave) 00088 { 00089 slave_index = i; 00090 master_index = slave2master[i]; 00091 if (output_matching_index) 00092 v[0] = *(master_index.begin()); 00093 } 00094 else 00095 { 00096 master_index.push_back(i); 00097 slave_index = master2slave[i]; 00098 if (output_matching_index) 00099 v[0] = slave_index; 00100 } 00101 00102 Vec master_part = v.subVec(output_matching_index,n_master_fields); 00103 Vec slave_part = v.subVec(n_master_fields+output_matching_index,n_slave_fields); 00104 00105 if (master_index.size()>0) 00106 { 00107 list<int>::const_iterator b_it = master_index.begin(); 00108 list<int>::const_iterator e_it = master_index.end(); 00109 master_part.clear(); 00110 for (list<int>::const_iterator it=b_it;it!=e_it;++it) 00111 { 00112 // copy the master fields 00113 master->getRow(*it,master_row); 00114 if (master_field_indices.size()>0) 00115 for (int j=0;j<master_field_indices.size();j++) 00116 master_part[j] += master_row[master_field_indices[j]]; 00117 else 00118 master_part += master_row; 00119 } 00120 } 00121 else 00122 master_part.fill(MISSING_VALUE); 00123 00124 if (slave_index>=0) 00125 { 00126 // copy the slave fields 00127 slave->getRow(slave_index,slave_row); 00128 if (slave_field_indices.size()>0) 00129 for (int j=0;j<slave_field_indices.size();j++) 00130 slave_part[j] = slave_row[slave_field_indices[j]]; 00131 else 00132 slave_part << slave_row; 00133 } 00134 else 00135 slave_part.fill(MISSING_VALUE); 00136 00137 } 00138 00139 void DatedJoinVMatrix::declareOptions(OptionList& ol) 00140 { 00141 declareOption(ol, "master", &DatedJoinVMatrix::master, OptionBase::buildoption, 00142 "Master vmatrix, whose columns are directly copied in the result."); 00143 00144 declareOption(ol, "slave", &DatedJoinVMatrix::slave, OptionBase::buildoption, 00145 "Slave vmatrix, of which only some columns are copied, when the\n" 00146 "key fields and the dates match."); 00147 00148 declareOption(ol, "master_key_indices", &DatedJoinVMatrix::master_key_indices, 00149 OptionBase::buildoption, 00150 "Indices of the 'key' fields in the master vmatrix. It is not necessary\n" 00151 "to specify them if the master_key_names are given or if the slave_key_names\n" 00152 "are specified (in that case they are assumed to be the same)\n" 00153 ); 00154 00155 declareOption(ol, "master_key_names", &DatedJoinVMatrix::master_key_names, 00156 OptionBase::buildoption, 00157 "Names of the 'key' fields in the master vmatrix. They should not be\n" 00158 "specified if the master_key_indices are given directly. If not provided\n" 00159 "and if the slave_key_names are specified they are assumed to be the same.\n" 00160 ); 00161 00162 declareOption(ol, "slave_key_indices", &DatedJoinVMatrix::slave_key_indices, 00163 OptionBase::buildoption, 00164 "Indices of the 'key' fields in the slave vmatrix. It is not necessary\n" 00165 "to specify them if the slave_key_names are given or if the master_key_names\n" 00166 "are specified (in that case they are assumed to be the same)\n" 00167 ); 00168 00169 declareOption(ol, "slave_key_names", &DatedJoinVMatrix::slave_key_names, 00170 OptionBase::buildoption, 00171 "Names of the 'key' fields in the slave vmatrix. They should not be\n" 00172 "specified if the slave_key_indices are given directly. If not provided\n" 00173 "and if the master_key_names are specified they are assumed to be the same.\n" 00174 ); 00175 00176 declareOption(ol, "slave_field_indices", &DatedJoinVMatrix::slave_field_indices, 00177 OptionBase::buildoption, 00178 "Indices of the fields in the slave vmatrix to be copied in result. It is not necessary\n" 00179 "to specify them if the slave_field_names are given.\n" 00180 "N.B. IF NEITHER slave_field_indices NOR slave_field_names are given then it is assumed\n" 00181 "ALL slave fields should be copied on output.\n" 00182 ); 00183 00184 declareOption(ol, "slave_field_names", &DatedJoinVMatrix::slave_field_names, 00185 OptionBase::buildoption, 00186 "Names of the fields in the slave vmatrix to be copied in result. It is not necessary\n" 00187 "to specify them if the slave_field_indices are given.\n" 00188 "N.B. IF NEITHER slave_field_indices NOR slave_field_names are given then it is assumed\n" 00189 "ALL slave fields should be copied on output.\n" 00190 ); 00191 00192 declareOption(ol, "master_field_indices", &DatedJoinVMatrix::master_field_indices, 00193 OptionBase::buildoption, 00194 "Indices of the fields in the master vmatrix to be copied in result. It is not necessary\n" 00195 "to specify them if the slave_field_names are given.\n" 00196 "N.B. IF NEITHER master_field_indices NOR master_field_names are given then it is assumed\n" 00197 "ALL master fields should be copied on output.\n" 00198 ); 00199 00200 declareOption(ol, "master_field_names", &DatedJoinVMatrix::master_field_names, 00201 OptionBase::buildoption, 00202 "Names of the fields in the slave vmatrix to be copied in result. It is not necessary\n" 00203 "to specify them if the slave_field_indices are given.\n" 00204 "N.B. IF NEITHER master_field_indices NOR master_field_names are given then it is assumed\n" 00205 "ALL master fields should be copied on output.\n" 00206 ); 00207 00208 declareOption(ol, "master_date_field_index", &DatedJoinVMatrix::master_date_field_index, 00209 OptionBase::buildoption, 00210 "Index of the date field in the master vmatrix. Should not be specified\n" 00211 "if the master_date_field_name is given.\n" 00212 ); 00213 00214 declareOption(ol, "master_date_field_name", &DatedJoinVMatrix::master_date_field_name, 00215 OptionBase::buildoption, 00216 "Name of the date field in the master vmatrix. Should not be specified\n" 00217 "if the master_date_field_index is given.\n" 00218 ); 00219 00220 declareOption(ol, "slave_date_interval_start_field_index", 00221 &DatedJoinVMatrix::slave_date_interval_start_field_index, 00222 OptionBase::buildoption, 00223 "Index of the date interval start field in the slave vmatrix.\n" 00224 "Should not be specified if the slave_date_interval_start_field_name is given.\n" 00225 ); 00226 00227 declareOption(ol, "slave_date_interval_start_field_name", 00228 &DatedJoinVMatrix::slave_date_interval_start_field_name, 00229 OptionBase::buildoption, 00230 "Name of the date interval start field in the slave vmatrix.\n" 00231 "Should not be specified if the slave_date_interval_start_field_index is given.\n" 00232 ); 00233 00234 declareOption(ol, "slave_date_interval_end_field_index", 00235 &DatedJoinVMatrix::slave_date_interval_end_field_index, 00236 OptionBase::buildoption, 00237 "Index of the date interval end field in the slave vmatrix.\n" 00238 "Should not be specified if the slave_date_interval_end_field_name is given.\n" 00239 ); 00240 00241 declareOption(ol, "slave_date_interval_end_field_name", 00242 &DatedJoinVMatrix::slave_date_interval_end_field_name, 00243 OptionBase::buildoption, 00244 "Name of the date interval end field in the slave vmatrix.\n" 00245 "Should not be specified if the slave_date_interval_end_field_index is given.\n" 00246 ); 00247 00248 declareOption(ol, "verbosity", &DatedJoinVMatrix::verbosity, 00249 OptionBase::buildoption, 00250 "0: no warning issued,\n" 00251 "1: warning issued if more than one slave row matches,\n" 00252 "2: details about these matches are printed\n" 00253 ); 00254 00255 declareOption(ol, "output_the_slave", &DatedJoinVMatrix::output_the_slave, 00256 OptionBase::buildoption, 00257 "If true than output the SLAVE rows (with master_fields_* from matching master row)\n" 00258 "instead of the MASTER rows (with slave_fields_* from the matching slave row)\n" 00259 ); 00260 00261 declareOption(ol, "output_matching_index", &DatedJoinVMatrix::output_matching_index, 00262 OptionBase::buildoption, 00263 "If true than output an extra variable 'matching_index' which contains the row\n" 00264 "index of the matching slave row (if !output_the_slave) or matching master row\n" 00265 "if (output_the_slave).\n" 00266 ); 00267 00268 // Now call the parent class' declareOptions 00269 inherited::declareOptions(ol); 00270 } 00271 00272 void DatedJoinVMatrix::build_() 00273 { 00274 if (master && slave) // we can't really build if we don't have them 00275 { 00276 // convert field names into indices 00277 // * get master key indices 00278 if (master_key_names.length()>0) 00279 { 00280 master_key_indices.resize(master_key_names.length()); 00281 for (int i=0;i<master_key_names.length();i++) 00282 master_key_indices[i] = master->getFieldIndex(master_key_names[i]); 00283 } 00284 else if (master_key_indices.length()==0) 00285 { 00286 if (slave_key_names.length()>0) 00287 { 00288 master_key_indices.resize(slave_key_names.length()); 00289 for (int i=0;i<slave_key_names.length();i++) 00290 master_key_indices[i] = master->getFieldIndex(slave_key_names[i]); 00291 } 00292 else PLERROR("DatedJoinVMatrix: No key names were provided and no master_key_indices were provided!"); 00293 } 00294 // * get slave key indices 00295 if (slave_key_names.length()>0) 00296 { 00297 slave_key_indices.resize(slave_key_names.length()); 00298 for (int i=0;i<slave_key_names.length();i++) 00299 slave_key_indices[i] = slave->getFieldIndex(slave_key_names[i]); 00300 } 00301 else if (slave_key_indices.length()==0) 00302 { 00303 if (master_key_names.length()>0) 00304 { 00305 slave_key_indices.resize(master_key_names.length()); 00306 for (int i=0;i<master_key_names.length();i++) 00307 slave_key_indices[i] = slave->getFieldIndex(master_key_names[i]); 00308 } 00309 else PLERROR("DatedJoinVMatrix: No key names were provided and no slave_key_indices were provided!"); 00310 } 00311 // * get slave field indices 00312 if (slave_field_names.length()>0) 00313 { 00314 slave_field_indices.resize(slave_field_names.length()); 00315 for (int i=0;i<slave_field_names.length();i++) 00316 slave_field_indices[i] = slave->getFieldIndex(slave_field_names[i]); 00317 } 00318 // * get master field indices 00319 if (master_field_names.length()>0) 00320 { 00321 master_field_indices.resize(master_field_names.length()); 00322 for (int i=0;i<master_field_names.length();i++) 00323 master_field_indices[i] = master->getFieldIndex(master_field_names[i]); 00324 } 00325 // * get master date field index 00326 if (master_date_field_name!="") 00327 master_date_field_index = master->getFieldIndex(master_date_field_name); 00328 else if (master_date_field_index<0) 00329 PLWARNING("DatedJoinVMatrix: No master_date_field_name was provided and no master_date_field_index was provided!"); 00330 // * get slave date interval start field index 00331 if (slave_date_interval_start_field_name!="") 00332 slave_date_interval_start_field_index = slave->getFieldIndex(slave_date_interval_start_field_name); 00333 else if (slave_date_interval_start_field_index<0 && master_date_field_index>=0) 00334 PLERROR("DatedJoinVMatrix: No slave_date_interval_start_field_name was provided and no slave_date_interval_start_field_index was provided!"); 00335 // * get slave date interval end field index 00336 if (slave_date_interval_end_field_name!="") 00337 slave_date_interval_end_field_index = slave->getFieldIndex(slave_date_interval_end_field_name); 00338 else if (slave_date_interval_end_field_index<0 && master_date_field_index>=0) 00339 PLERROR("DatedJoinVMatrix: No slave_date_interval_end_field_name was provided and no slave_date_interval_end_field_index was provided!"); 00340 00341 // INDEX THE SLAVE 00342 ProgressBar* pb=new ProgressBar("DatedJoinVMatrix: indexing the slave.",slave.length()); 00343 key.resize(slave_key_indices.length()); 00344 slave_row.resize(slave.width()); 00345 master_row.resize(master.width()); 00346 for (int i=0;i<slave.length();i++) 00347 { 00348 slave->getRow(i,slave_row); 00349 for (int j=0;j<slave_key_indices.size();j++) 00350 key[j] = slave_row[slave_key_indices[j]]; 00351 mp.insert(make_pair(key,i)); 00352 pb->update(i); 00353 } 00354 delete pb; 00355 00356 // set the width and the length 00357 if (master_field_indices.size()>0) 00358 n_master_fields = master_field_indices.size(); 00359 else 00360 n_master_fields = master->width(); 00361 if (slave_field_indices.size()>0) 00362 n_slave_fields = slave_field_indices.size(); 00363 else 00364 n_slave_fields = slave->width(); 00365 width_ = output_matching_index + n_master_fields + n_slave_fields; 00366 if (output_the_slave) 00367 length_ = slave.length(); 00368 else 00369 length_ = master.length(); 00370 00372 fieldinfos.resize(width_); 00373 Array<VMField> master_infos = master->getFieldInfos(); 00374 Array<VMField> slave_infos = slave->getFieldInfos(); 00375 if (output_matching_index) 00376 fieldinfos[0].name="matching_index"; 00377 if (master_infos.size() > 0) 00378 { 00379 if (master_field_indices.size()>0) 00380 for (int i=0; i<n_master_fields; ++i) 00381 fieldinfos[output_matching_index+i] = master_infos[master_field_indices[i]]; 00382 else 00383 for (int i=0; i<n_master_fields; ++i) 00384 fieldinfos[output_matching_index+i] = master_infos[i]; 00385 } 00386 if (slave_infos.size() > 0) 00387 { 00388 if (slave_field_indices.size()>0) 00389 for (int i=0; i<slave_field_indices.size(); ++i) 00390 { 00391 VMField f=slave_infos[slave_field_indices[i]]; 00392 if ((master_field_indices.size()==0 && master->fieldIndex(f.name)>=0) 00393 || master_field_names.contains(f.name)) 00394 f.name = "slave." + f.name; 00395 fieldinfos[output_matching_index+n_master_fields+i] = f; 00396 } 00397 else 00398 for (int i=0; i<n_slave_fields; ++i) 00399 { 00400 VMField f=slave_infos[i]; 00401 if ((master_field_indices.size()==0 && master->fieldIndex(f.name)>=0) 00402 || master_field_names.contains(f.name)) 00403 f.name = "slave." + f.name; 00404 fieldinfos[output_matching_index+n_master_fields+i] = f; 00405 } 00406 } 00407 pb=new ProgressBar("DatedJoinVMatrix: matching the master and slave.",master->length()); 00408 // pre-compute the 'match' (N.B. this is expensive...) 00409 master2slave.resize(master->length()); 00410 master2slave.fill(-1); 00411 slave2master.resize(slave->length()); 00412 for (int i=0;i<master->length();i++) 00413 { 00414 master->getRow(i,master_row); 00415 // build a key to search in the slave vmat 00416 for (int j=0;j<master_key_indices.size();j++) 00417 key[j] = master_row[master_key_indices[j]]; 00418 00419 // get the list of matching slave rows 00420 Maptype::const_iterator it,low,upp; 00421 pair<Maptype::const_iterator,Maptype::const_iterator> matches=mp.equal_range(key); 00422 low=matches.first; 00423 upp=matches.second; 00424 if (low!=mp.end()) 00425 { 00426 PDate master_date; 00427 if (master_date_field_index>=0) 00428 master_date = float_to_date(master_row[master_date_field_index]); 00429 PDate latest_match; 00430 int n_matches=0; 00431 static TVec<int> matches; 00432 if (verbosity>1) matches.resize(0); 00433 int matching_slave_row_index = -1; 00434 // iterate over the matching slave rows 00435 for (it=low;it!=upp;++it) 00436 { 00437 slave->getRow(it->second,slave_row); 00438 if (master_date_field_index>=0) 00439 { 00440 PDate slave_date_interval_start = float_to_date(slave_row[slave_date_interval_start_field_index]); 00441 PDate slave_date_interval_end = float_to_date(slave_row[slave_date_interval_end_field_index]); 00442 if (master_date>slave_date_interval_start && master_date<=slave_date_interval_end) 00443 { 00444 if (n_matches==0 || slave_date_interval_start > latest_match) 00445 { 00446 latest_match = slave_date_interval_start; 00447 matching_slave_row_index = it->second; 00448 } 00449 n_matches++; 00450 if (verbosity>1) matches.append(it->second); 00451 } 00452 } else // no date, the LAST one will remain 00453 { 00454 matching_slave_row_index = it->second; 00455 n_matches++; 00456 if (verbosity>1) matches.append(it->second); 00457 } 00458 } 00459 if (matching_slave_row_index>=0) 00460 { 00461 master2slave[i] = matching_slave_row_index; 00462 slave2master[matching_slave_row_index].push_back(i); 00463 } 00464 if (n_matches>1 && verbosity>0) 00465 { 00466 PLWARNING("DatedJointVMatrix:getRow(%d,.) matched more than once\n",i); 00467 if (verbosity >1) 00468 for (int j=0;j<n_matches;j++) 00469 cerr << "master row " << i << " matched slave row " << matches[j] << endl; 00470 } 00471 } 00472 pb->update(i); 00473 } 00474 delete pb; 00475 } 00476 } 00477 00478 // ### Nothing to add here, simply calls build_ 00479 void DatedJoinVMatrix::build() 00480 { 00481 inherited::build(); 00482 build_(); 00483 } 00484 00485 void DatedJoinVMatrix::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies) 00486 { 00487 inherited::makeDeepCopyFromShallowCopy(copies); 00488 00489 deepCopyField(slave_row, copies); 00490 deepCopyField(key, copies); 00491 deepCopyField(master2slave, copies); 00492 deepCopyField(slave2master, copies); 00493 deepCopyField(master, copies); 00494 deepCopyField(slave, copies); 00495 deepCopyField(master_key_indices, copies); 00496 deepCopyField(slave_key_indices, copies); 00497 deepCopyField(master_key_names, copies); 00498 deepCopyField(slave_key_names, copies); 00499 deepCopyField(slave_field_indices, copies); 00500 deepCopyField(slave_field_names, copies); 00501 } 00502 00503 } // end of namespace PLearn 00504

Generated on Tue Aug 17 15:51:01 2004 for PLearn by doxygen 1.3.7