
VMat_maths.cc

// -*- C++ -*-

// PLearn (A C++ Machine Learning Library)
// Copyright (C) 1998 Pascal Vincent
// Copyright (C) 1999-2002 Pascal Vincent, Yoshua Bengio and University of Montreal
//

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//  1. Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//
//  2. Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//
//  3. The name of the authors may not be used to endorse or promote
//     products derived from this software without specific prior written
//     permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
// NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This file is part of the PLearn library. For more information on the PLearn
// library, go to the PLearn Web site at www.plearn.org


/*
 * $Id: VMat_maths.cc,v 1.27 2004/08/11 12:28:46 tihocan Exp $
 * This file is part of the PLearn library.
 ******************************************************* */
#include "VMat_maths.h"
#include <plearn/math/TMat_maths.h>
#include <plearn/io/IntVecFile.h>
#include "MemoryVMatrix.h"
#include "ShiftAndRescaleVMatrix.h"
#include "ExtendedVMatrix.h"
#include <plearn/base/Array.h>
#include <plearn/math/plapack.h>
#include <plearn/math/random.h>
#include <plearn/io/TmpFilenames.h>
#include <plearn/io/fileutils.h>
#include <plearn/sys/PLMPI.h>
#include <vector>
#include <plearn/math/VecStatsCollector.h>
#include <plearn/math/ConditionalStatsCollector.h>
#include <plearn/math/stats_utils.h>
#include <plearn/math/BottomNI.h>

namespace PLearn {
using namespace std;

void computeWeightedMean(Vec weights, VMat d, Vec& meanvec)
{
    int w = d->width();
    int l = d->length();
    if(weights.length() != l) {
        PLERROR("In computeWeightedMean: weights.length() != d->length()\n");
    }
    meanvec.resize(w);
    meanvec.clear();
    Vec samplevec(w);
    for(int i=0; i<l; i++)
    {
        d->getRow(i,samplevec);
        meanvec += weights[i]*samplevec;
    }
    meanvec /= sum(weights);
}


void computeRange(VMat d, Vec& minvec, Vec& maxvec)
{
    int l = d.length();
    int w = d.width();
    minvec.resize(w);
    maxvec.resize(w);
    minvec.fill(FLT_MAX);
    maxvec.fill(-FLT_MAX);

    Vec v(w);
    for(int i=0; i<l; i++)
    {
        d->getRow(i,v);
        for(int j=0; j<w; j++)
        {
            minvec[j] = min(v[j],minvec[j]);
            maxvec[j] = max(v[j],maxvec[j]);
        }
    }
}

void computeRowMean(VMat d, Vec& meanvec)
{
    meanvec.resize(d->length());
    Vec samplevec(d->width());
    for(int i=0; i<d->length(); i++)
    {
        d->getRow(i,samplevec);
        meanvec[i] = mean(samplevec);
    }
}

void computeMean(VMat d, Vec& meanvec)
{
    meanvec.resize(d->width());
    meanvec.clear();
    Vec samplevec(d->width());
    for(int i=0; i<d->length(); i++)
    {
        d->getRow(i,samplevec);
        meanvec += samplevec;
    }
    meanvec /= real(d->length());
}
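/* Usage sketch (added for illustration, not part of the original file): the
   helpers above all take the abstract VMat interface, so an in-memory Mat can
   be wrapped in a MemoryVMatrix (included above) and passed directly.
   Variable names here are hypothetical.

       Mat data(100, 5);                  // filled elsewhere
       VMat vm = new MemoryVMatrix(data);
       Vec meanvec, minvec, maxvec;
       computeMean(vm, meanvec);          // column-wise means
       computeRange(vm, minvec, maxvec);  // column-wise min / max
*/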
Mat computeBasicStats(VMat m)
{
    Vec v(m.width());
    real* vdata = v.data();
    Mat stats(10,m.width());
    Vec mean_row = stats(MEAN_ROW);
    Vec stddev_row = stats(STDDEV_ROW);
    Vec min_row = stats(MIN_ROW);
    Vec max_row = stats(MAX_ROW);
    Vec nmissing_row = stats(NMISSING_ROW);
    Vec nzero_row = stats(NZERO_ROW);
    Vec npositive_row = stats(NPOSITIVE_ROW);
    Vec nnegative_row = stats(NNEGATIVE_ROW);
    Vec meanpos_row = stats(MEANPOS_ROW);
    Vec stddevpos_row = stats(STDDEVPOS_ROW);
    min_row.fill(FLT_MAX);
    max_row.fill(-FLT_MAX);

    for(int i=0; i<m.length(); i++)
    {
        m->getRow(i,v);
        for(int j=0; j<v.length(); j++)
        {
            real val = vdata[j];
            if(is_missing(val))
                nmissing_row[j]++;
            else
            {
                if(val<min_row[j])
                    min_row[j] = val;
                if(val>max_row[j])
                    max_row[j] = val;

                if(val==0.)
                    nzero_row[j]++;
                else if(val>0.)
                {
                    npositive_row[j]++;
                    mean_row[j] += val;
                    stddev_row[j] += val*val;
                    meanpos_row[j] += val;
                    stddevpos_row[j] += val*val;
                }
                else // val < 0.
                {
                    nnegative_row[j]++;
                    mean_row[j] += val;
                    stddev_row[j] += val*val;
                }
            }
        }
    }
    for(int j=0; j<stats.width(); j++)
    {
        real nnonmissing = nzero_row[j]+nnegative_row[j]+npositive_row[j];
        mean_row[j] /= nnonmissing;
        meanpos_row[j] /= npositive_row[j];
        stddev_row[j] = sqrt(stddev_row[j]/nnonmissing - square(mean_row[j]));
        stddevpos_row[j] = sqrt(stddevpos_row[j]/npositive_row[j] - square(meanpos_row[j]));
    }
    return stats;
}

void computeStats(VMat m, VecStatsCollector& st, bool report_progress)
{
    st.forget();
    st.setFieldNames(m->fieldNames());
    Vec v(m.width());
    int l = m.length();
    ProgressBar* pbar = 0;
    if (report_progress)
        pbar = new ProgressBar("Computing statistics", l);
    for(int i=0; i<l; i++)
    {
        m->getRow(i,v);
        st.update(v);
        if (report_progress)
            pbar->update(i);
    }
    if (pbar)
        delete pbar;
    st.finalize();
}

TVec<StatsCollector> computeStats(VMat m, int maxnvalues, bool report_progress)
{
    int w = m.width();
    int l = m.length();
    TVec<StatsCollector> stats(w, StatsCollector(maxnvalues));
    Vec v(w);
    ProgressBar* pbar = 0;
    if (report_progress)
        pbar = new ProgressBar("Computing statistics", l);
    for(int i=0; i<l; i++)
    {
        m->getRow(i,v);
        for(int j=0; j<w; j++)
            stats[j].update(v[j]);
        if (report_progress)
            pbar->update(i);
    }
    if (pbar)
        delete pbar;
    return stats;
}

PP<ConditionalStatsCollector> computeConditionalStats(VMat m, int condfield, TVec<RealMapping> ranges)
{
    PP<ConditionalStatsCollector> condst = new ConditionalStatsCollector;
    condst->setBinMappingsAndCondvar(ranges, condfield);
    int l = m.length();
    int w = m.width();
    Vec v(w);
    for(int i=0; i<l; i++)
    {
        if(i%10000==0)
            cerr << "computeConditionalStats: row " << i << " of " << l << endl;
        m->getRow(i,v);
        condst->update(v);
    }
    return condst;
}

// mean = sum/n
// variance = (sumsquare-square(sum)/n)/(n-1)
// stddev_of_mean = sqrt(variance/n);
// mse = sumsquare/n - square(sum/n)
// stddev_of_mse = variance*sqrt(2./n);
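/* Worked check of the running-sums formulas above (added for illustration):
   for the sample {1, 2, 3}, n = 3, sum = 6, sumsquare = 14, so
       mean           = 6/3                 = 2
       variance       = (14 - 36/3)/(3-1)   = 1
       stddev_of_mean = sqrt(1/3)           ~ 0.577
       mse            = 14/3 - 4            = 2/3
   These are the per-bin quantities that computeConditionalMeans below stores
   in place of the raw sums during its postprocessing pass.
*/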
Array<Mat> computeConditionalMeans(VMat trainset, int targetsize, Mat& basic_stats)
{
    if(!basic_stats)
        basic_stats = computeBasicStats(trainset);

    int inputsize = trainset.width()-targetsize;
    Array<Mat> a(inputsize);
    for(int j=0; j<inputsize; j++)
    {
        real minval = basic_stats(MIN_ROW,j);
        real maxval = basic_stats(MAX_ROW,j);
        if(is_integer(minval) && is_integer(maxval) && maxval-minval<400)
        {
            a[j] = Mat(int(maxval-minval+1),2+targetsize*4);
            for(int k=0; k<a[j].length(); k++)
                a[j](k,0) = minval+k;
        }
    }

    Vec row(trainset.width());
    Vec input = row.subVec(0,inputsize);
    Vec target = row.subVec(inputsize,targetsize);
    for(int i=0; i<trainset.length(); i++)
    {
        trainset->getRow(i,row);
        for(int j=0; j<inputsize; j++)
        {
            Mat& m = a[j];
            if(m.isNotEmpty())
            {
                int k = int(input[j]-basic_stats(MIN_ROW,j));
                Vec m_k = m(k);
                m_k[1]++;
                for(int l=0; l<targetsize; l++)
                {
                    real targetval = target[l];
                    m_k[2+4*l] += targetval;
                    m_k[3+4*l] += square(targetval);
                }
            }
        }
    }

    // postprocessing:
    for(int j=0; j<inputsize; j++)
    {
        Mat& m = a[j];
        if(m.isNotEmpty())
        {
            for(int k=0; k<m.length(); k++)
            {
                Vec m_k = m(k);
                real n = m_k[1];
                if(n>0.)
                {
                    // replace sum by mean and sumsquare by variance
                    for(int l=0; l<targetsize; l++)
                    {
                        real sum = m_k[2+4*l];
                        real sumsquare = m_k[3+4*l];
                        real mean = sum/n;
                        real variance = (sumsquare-square(sum)/n)/(n-1);
                        real mse = sumsquare/n - square(sum/n);
                        real stddev_of_mean = sqrt(variance/n);
                        real stddev_of_mse = variance*sqrt(2./n);
                        m_k[2+4*l] = mean;
                        m_k[3+4*l] = stddev_of_mean;
                        m_k[4+4*l] = mse;
                        m_k[5+4*l] = stddev_of_mse;
                    }
                }
            }
        }
    }

    return a;
}

void computeMeanAndVariance(VMat d, Vec& meanvec, Vec& variancevec)
{
    computeMean(d, meanvec);
    variancevec.resize(d->width());
    variancevec.clear();
    int w = d->width();
    int l = d->length();
    Vec samplevec(w);
    Vec diffvec(w);
    Vec sqdiffvec(w);
    for(int i=0; i<l; i++)
    {
        d->getRow(i,samplevec);
        //variancevec += powdistance(samplevec, meanvec, 2.0);
        substract(samplevec,meanvec,diffvec);
        multiply(diffvec,diffvec,sqdiffvec);
        variancevec += sqdiffvec;
    }
    variancevec /= real(l-1);
}

void computeInputMean(VMat d, Vec& meanvec)
{
    Vec input;
    Vec target;
    real weight;
    int l = d->length();
    int n = d->inputsize();
    real weightsum = 0;
    meanvec.resize(n);
    meanvec.clear();
    for(int i=0; i<l; i++)
    {
        d->getExample(i,input,target,weight);
        weightsum += weight;
        multiplyAcc(meanvec,input,weight);
    }
    meanvec /= weightsum;
}

void computeInputMeanAndCovar(VMat d, Vec& meanvec, Mat& covarmat)
{
    Vec input;
    Vec target;
    Vec offset;
    real weight;
    int l = d->length();
    int n = d->inputsize();
    real weightsum = 0;
    meanvec.resize(n);
    meanvec.clear();
    covarmat.resize(n,n);
    covarmat.clear();
    offset.resize(n);
    for(int i=0; i<l; i++)
    {
        d->getExample(i,input,target,weight);
        weightsum += weight;
        multiplyAcc(meanvec,input,weight);
        if (i==0) offset << input;
        input-=offset;
        externalProductScaleAcc(covarmat, input, input, weight);
    }
    meanvec *= 1/weightsum;
    covarmat *= 1/weightsum;
    offset-=meanvec;
    externalProductScaleAcc(covarmat, offset, offset, real(-1));
}
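/* Note (added commentary): computeInputMeanAndCovar above and
   computeInputMeanAndVariance below accumulate second moments of x - offset
   (where offset is the first row) rather than of x itself, relying on the
   identity, valid for any constant vector c,
       Cov(x) = E[(x - c)(x - c)'] - (E[x] - c)(E[x] - c)'
   Shifting by a value close to the data reduces the cancellation error of
   the naive E[xx'] - E[x]E[x]' formula; the final externalProductScaleAcc
   call with scale -1 subtracts the (E[x] - c)(E[x] - c)' correction term.
*/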
void computeInputMeanAndVariance(VMat d, Vec& meanvec, Vec& var)
{
    Vec input;
    Vec target;
    Vec offset;
    real weight;
    int l = d->length();
    int n = d->inputsize();
    real weightsum = 0;
    meanvec.resize(n);
    meanvec.clear();
    var.resize(n);
    var.clear();
    offset.resize(n);
    for(int i=0; i<l; i++)
    {
        d->getExample(i,input,target,weight);
        if (i==0) offset<<input;
        weightsum+=weight;
        for(int j=0;j<input.size();j++)
        {
            real xj = input[j]-offset[j];
            var[j]+=weight*xj*xj;
        }
        multiplyAcc(meanvec,input,weight);
    }
    meanvec /= weightsum;
    var /= weightsum;
    for(int i=0;i<input.size();i++)
    {
        real mu=meanvec[i]-offset[i];
        var[i]-=mu*mu;
        if (var[i] < 0) {
            // This can happen because of numerical imprecisions.
            var[i] = 0;
        }
    }
}


void computeWeightedMeanAndCovar(Vec weights, VMat d, Vec& meanvec, Mat& covarmat)
{
    int w = d->width();
    int l = d->length();
    computeWeightedMean(weights, d, meanvec);
    covarmat.resize(w,w);
    covarmat.clear();
    Vec samplevec(w);
    Vec diffvec(w);
    real weight_sum = 0;
    for(int i=0; i<l; i++)
    {
        d->getRow(i,samplevec);
        substract(samplevec,meanvec,diffvec);
        real weight = weights[i];
        externalProductScaleAcc(covarmat, diffvec,diffvec, weight);
        weight_sum += weight;
    }
    covarmat *= real(1./weight_sum);
}

real computeWeightedMeanAndCovar(VMat d, Vec& meanvec, Mat& covarmat, real threshold)
{
    Vec samplevec;
    Vec diffvec;
    int w = d->width()-1;
    int l = d->length();
    samplevec.resize(w+1);
    diffvec.resize(w);
    Vec input = samplevec.subVec(0,w);
    real& weight = samplevec[w];

    real weightsum = 0;

    // Compute weighted mean
    meanvec.resize(w);
    meanvec.clear();
    for(int i=0; i<l; i++)
    {
        d->getRow(i,samplevec);
        if(weight>threshold)
        {
            multiplyAcc(meanvec, input, weight);
            weightsum += weight;
        }
    }

    meanvec *= real(1./weightsum);

    // Compute weighted covariance matrix
    covarmat.resize(w,w);
    covarmat.clear();
    for(int i=0; i<l; i++)
    {
        d->getRow(i,samplevec);
        if(weight>threshold)
        {
            substract(input,meanvec,diffvec);
            externalProductScaleAcc(covarmat, diffvec,diffvec, weight);
        }
    }

    covarmat *= real(1./weightsum);

    return weightsum;
}


void computeMeanAndCovar(VMat m, Vec& meanvec, Mat& covarmat, ostream& logstream)
{
    int w = m->width();
    int l = m->length();
    meanvec.resize(w);
    covarmat.resize(w,w);

    MemoryVMatrix* memvm = dynamic_cast<MemoryVMatrix*>((VMatrix*)m);
    if(memvm)
        computeMeanAndCovar(m->toMat(), meanvec, covarmat);
    else
    {
        meanvec.clear();
        covarmat.clear();
        Vec v(w);

        ProgressBar progbar(logstream,"Computing covariance",l);

        if(USING_MPI && PLMPI::synchronized && PLMPI::size>1)
        {
#if USING_MPI
            PLMPI::synchronized = false;

            if(!covarmat.isCompact())
                PLERROR("In computeMeanAndCovar: MPI implementation cannot handle non-compact covariance matrices, please pass a compact matrix");

            // temporary storages for mpi
            Vec meanvec_b(meanvec.length());
            Mat covarmat_b(covarmat.length(),covarmat.width());

            for(int i=PLMPI::rank; i<l; i+=PLMPI::size)
            {
                m->getRow(i,v);
                meanvec_b += v;
                externalProductAcc(covarmat_b, v, v);
                progbar(i);
            }

            MPI_Reduce(meanvec_b.data(), meanvec.data(), meanvec.length(), PLMPI_REAL, MPI_SUM, 0, MPI_COMM_WORLD);
            MPI_Bcast(meanvec.data(), meanvec.length(), PLMPI_REAL, 0, MPI_COMM_WORLD);
            MPI_Reduce(covarmat_b.data(), covarmat.data(), covarmat.size(), PLMPI_REAL, MPI_SUM, 0, MPI_COMM_WORLD);
            MPI_Bcast(covarmat.data(), covarmat.size(), PLMPI_REAL, 0, MPI_COMM_WORLD);

            PLMPI::synchronized = true;
#endif
        }
        else
        {
            for(int i=0; i<l; i++)
            {
                m->getRow(i,v);
                meanvec += v;
                externalProductAcc(covarmat, v, v);
                progbar(i);
            }
        }

        // get the real averages and covariances, and priors
        meanvec /= real(l);
        covarmat /= real(l);
        externalProductScaleAcc(covarmat,meanvec,meanvec,real(-1.));
    }
}
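/* Usage sketch (added for illustration, hypothetical variable names):
   estimating the full covariance matrix of a dataset, e.g. for a Gaussian
   model. Note that this version normalizes by l (population covariance),
   not l-1.

       VMat data = ...;   // any VMat source
       Vec mu;
       Mat sigma;
       computeMeanAndCovar(data, mu, sigma, cerr);  // sigma = E[xx'] - mu mu'
*/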
/* // two pass version
void computeMeanAndCovar(VMat d, Vec& meanvec, Mat& covarmat)
{
    int w = d->width();
    int l = d->length();
    computeMean(d, meanvec);
    covarmat.resize(w,w);
    covarmat.clear();
    Vec samplevec(w);
    Vec diffvec(w);
    Mat sqdiffmat(w,w);
    for(int i=0; i<l; i++)
    {
        d->getRow(i,samplevec);
        substract(samplevec,meanvec,diffvec);
        externalProduct(sqdiffmat, diffvec,diffvec);
        covarmat += sqdiffmat;
    }
    covarmat /= l-1;
}
*/

void computeMeanAndStddev(VMat d, Vec& meanvec, Vec& stddevvec)
{
    computeMeanAndVariance(d,meanvec,stddevvec);
    for(int i=0; i<stddevvec.length(); i++)
        stddevvec[i] = sqrt(stddevvec[i]);
}

void autocorrelation_function(const VMat& data, Mat& acf)
{
    int T = data.length();
    int N = data.width();
    acf.resize(T-2, N);

    for(int delta=0; delta < T-2; delta++)
    {
        Vec sumT(N);
        Vec sumD(N);
        TVec<Vec> products(N);

        // t = delta
        for(int k=0; k < N; k++)
        {
            real ts = data(delta, k);
            real ds = data(0, k);

            sumT[k] = ts;
            sumD[k] = ds;

            products[k].resize(3);
            products[k][0] = ts*ts;
            products[k][1] = ds*ds;
            products[k][2] = ts*ds;
        }

        for(int t=delta+1; t < T; t++)
        {
            for(int k=0; k < N; k++)
            {
                real ts = data(t, k);
                real ds = data(t-delta, k);

                sumT[k] += ts;
                sumD[k] += ds;

                products[k][0] += ts*ts;
                products[k][1] += ds*ds;
                products[k][2] += ts*ds;
            }
        }

        // Actual computation of the correlation
        for(int k=0; k < N; k++)
        {
            int count = T-delta;
            real multiplied_var_t = products[k][0] - square(sumT[k])/count;
            real multiplied_var_d = products[k][1] - square(sumD[k])/count;
            acf(delta, k) = (products[k][2] - sumT[k]*sumD[k]/count) / sqrt(multiplied_var_t * multiplied_var_d);
        }
    }
}


VMat normalize(VMat d, Vec meanvec, Vec stddevvec)
{
    int inputsize = meanvec.length();

    Vec shiftvec(d.width(),0.0);
    shiftvec.subVec(0,inputsize) << meanvec;
    negateElements(shiftvec);

    Vec scalevec(d.width(),1.0);
    scalevec.subVec(0,inputsize) << stddevvec;
    invertElements(scalevec);

    return new ShiftAndRescaleVMatrix(d, shiftvec, scalevec);
}

VMat normalize(VMat d, int inputsize, int ntrain)
{
    Vec meanvec(inputsize);
    Vec stddevvec(inputsize);
    computeMeanAndStddev(d.subMat(0,0,ntrain,inputsize), meanvec, stddevvec);
    return normalize(d, meanvec, stddevvec);
}

VMat grep(VMat d, int col, Vec values, bool exclude)
{
    Vec indices(d.length());
    int nrows = 0;
    for(int i=0; i<d.length(); i++)
    {
        bool contains = values.contains(d(i,col));
        if( (!exclude && contains) || (exclude && !contains) )
            indices[nrows++] = i;
    }
    indices = indices.subVec(0,nrows);
    return d.rows(indices.copy());
}
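/* Usage sketch (added for illustration): standardizing the input columns of
   a dataset using statistics estimated on its first ntrain rows only, which
   avoids leaking test-set statistics into the preprocessing. Names are
   hypothetical.

       int inputsize = 10, ntrain = 1000;
       VMat normalized = normalize(raw_data, inputsize, ntrain);
*/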
map<real, int> countOccurencesInColumn(VMat m, int col)
{
    map<real, int> counts; // result we will return
    map<real, int>::iterator found;
    int l = m.length();
    for(int i=0; i<l; i++)
    {
        real val = m(i,col);
        if (is_missing(val))
            // The 'nan' real value has to be dealt with separately. Here, we just
            // raise an error to keep it simple.
            PLERROR("In countOccurencesInColumn - Found a missing value, this case is currently not handled");
        found = counts.find(val);
        if(found==counts.end())
            counts[val] = 1;
        else
            found->second++;
    }
    return counts;
}

map<real, TVec<int> > indicesOfOccurencesInColumn(VMat m, int col)
{
    map< real, TVec<int> > indices; // result we will return
    map<real, int> counts = countOccurencesInColumn(m,col);
    map<real, int>::iterator it = counts.begin();
    map<real, int>::iterator itend = counts.end();
    for(; it!=itend; ++it)
    {
        indices[it->first].resize(it->second); // allocate the exact amount of memory
        indices[it->first].resize(0); // reset the size to 0 so we can do appends...
    }
    int l = m.length();
    for(int i=0; i<l; i++)
        indices[m(i,col)].push_back(i);
    return indices;
}

VMat grep(VMat d, int col, Vec values, const string& indexfile, bool exclude)
{
    if(!file_exists(indexfile))
    {
        IntVecFile indices(indexfile,true);
        for(int i=0; i<d.length(); i++)
        {
            bool contains = values.contains(d(i,col));
            if( (!exclude && contains) || (exclude && !contains) )
                indices.append(i);
        }
    }
    return d.rows(indexfile);
}

VMat filter(VMat d, const string& indexfile)
{
    if(!file_exists(indexfile) || file_size(indexfile)==0)
    {
        IntVecFile indices(indexfile,true);
        Vec v(d.width());
        for(int i=0; i<d.length(); i++)
        {
            d->getRow(i,v);
            if(!v.hasMissing())
                indices.append(i);
        }
    }
    return d.rows(indexfile);
}


VMat shuffle(VMat d)
{
    Vec indices(0, d.length()-1, 1); // Range-vector
    shuffleElements(indices);
    return d.rows(indices);
}

VMat bootstrap(VMat d, bool reorder, bool norepeat)
{
    Vec indices;
    if (norepeat)
    {
        indices = Vec(0, d.length()-1, 1); // Range-vector
        shuffleElements(indices);
        indices = indices.subVec(0,int(0.667 * d.length()));
        if (reorder)
            sortElements(indices);
        return d.rows(indices);
    }
    else
    {
        indices.resize(d.length());
        for (int i=0;i<d.length();i++)
            indices[i] = uniform_multinomial_sample(d.length());
    }
    if (reorder)
        sortElements(indices);
    return d.rows(indices);
}
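/* Usage sketch (added for illustration): drawing a bootstrap resample of a
   dataset. With norepeat=false each row index is drawn uniformly with
   replacement; with norepeat=true a random two-thirds subsample without
   replacement is returned instead.

       VMat resample = bootstrap(data, true, false);  // sorted, with repeats
*/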
Mat transposeProduct(VMat m)
{
    Mat result(m.width(),m.width());

    Vec v(m.width());
    Mat vrowmat = rowmatrix(v);

    for(int i=0; i<m.length(); i++)
    {
        m->getRow(i,v);
        transposeProductAcc(result, vrowmat,vrowmat);
    }
    return result;
}

Mat transposeProduct(VMat m1, VMat m2)
{
    if(m1.length()!=m2.length())
        PLERROR("in Mat transposeProduct(VMat m1, VMat m2) arguments have incompatible dimensions");

    Mat result(m1.width(),m2.width());

    Vec v1(m1.width());
    Vec v2(m2.width());
    Mat v1rowmat = rowmatrix(v1);
    Mat v2rowmat = rowmatrix(v2);

    for(int i=0; i<m1.length(); i++)
    {
        m1->getRow(i,v1);
        m2->getRow(i,v2);
        transposeProductAcc(result, v1rowmat,v2rowmat);
    }
    return result;
}

Vec transposeProduct(VMat m1, Vec v2)
{
    if(m1.length()!=v2.length())
        PLERROR("in Vec transposeProduct(VMat m1, Vec v2) arguments have incompatible dimensions");

    Vec result(m1.width(),1);
    result.clear();

    Vec v1(m1.width());
    for(int i=0; i<m1.length(); i++)
    {
        m1->getRow(i,v1);
        result += v1 * v2[i];
    }
    return result;
}

Mat productTranspose(VMat m1, VMat m2)
{
    if(m1.width()!=m2.width())
        PLERROR("in Mat productTranspose(VMat m1, VMat m2) arguments have incompatible dimensions");

    int m1l = (m1.length());
    int m2l = (m2.length());
    int w = (m1.width());
    Mat result(m1l,m2l);

    Vec v1(w);
    Vec v2(w);

    for(int i=0; i<m1l; i++)
    {
        m1->getRow(i,v1);
        for(int j=0; j<m2l; j++)
        {
            m2->getRow(j,v2);
            result(i,j) = dot(v1,v2);
        }
    }
    return result;
}

Mat product(Mat m1, VMat m2)
{
    if(m1.width()!=m2.length())
        PLERROR("in Mat product(Mat m1, VMat m2) arguments have incompatible dimensions");

    Mat result(m1.length(),m2.width());
    result.clear();

    Vec v2(m2.width());
    Mat v2rowmat = rowmatrix(v2);

    for(int i=0; i<m1.width(); i++)
    {
        m2->getRow(i,v2);
        productAcc(result, m1.column(i), v2rowmat);
    }
    return result;
}

VMat transpose(VMat m1)
{
    return VMat(transpose(m1.toMat()));
}
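/* Note (added commentary): the helpers above compute matrix products while
   streaming rows from the VMat, so only one row (or one pair of rows) needs
   to be in memory at a time. transposeProduct(m) accumulates M'M as the sum
   over rows of the outer products of each row with itself, which is the same
   trick linearRegression below uses to form X'X and X'Y without ever
   materializing the full design matrix X.
*/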
real linearRegression(VMat inputs, VMat outputs, real weight_decay, Mat theta_t,
                      bool use_precomputed_XtX_XtY, Mat XtX, Mat XtY, real& sum_squared_Y,
                      bool return_squared_loss, int verbose_every, bool cholesky)
{
    if (outputs.length()!=inputs.length())
        PLERROR("linearRegression: inputs.length()=%d while outputs.length()=%d",inputs.length(),outputs.length());
    if (theta_t.length()!=inputs.width()+1 || theta_t.width()!=outputs.width())
        PLERROR("linearRegression: theta_t(%d,%d) should be (%dx%d)",
                theta_t.length(),theta_t.width(),inputs.width()+1,outputs.width());

    int inputsize = inputs.width();
    int targetsize = outputs.width();

    if(XtX.length()!=inputsize+1 || XtX.width()!=inputsize+1)
        PLERROR("In linearRegression: XtX should have dimensions %dx%d (inputs.width()+1)x(inputs.width()+1)",inputsize+1,inputsize+1);
    if(XtY.length()!=inputsize+1 || XtY.width()!=targetsize)
        PLERROR("In linearRegression: XtY should have dimensions %dx%d (inputs.width()+1)x(outputs.width())",inputsize+1,targetsize);

    if(!use_precomputed_XtX_XtY) // then compute them
    {
        VMat X = new ExtendedVMatrix(inputs,0,0,1,0,1.0); // prepend a first column of ones
        VMat Y = outputs;
        // *************
        // Do efficiently the following:
        // XtX << transposeProduct(X); // '<<' to copy elements (as transposeProduct returns a new matrix)
        // XtY << transposeProduct(X,Y); // same thing (remember '=' for Mat never copies elements)
        XtX.clear();
        XtY.clear();
        sum_squared_Y=0;
        Vec x(X.width());
        Vec y(Y.width());
        int l=X.length();
        for(int i=0; i<l; i++)
        {
            X->getRow(i,x);
            Y->getRow(i,y);
            externalProductAcc(XtX, x,x);
            externalProductAcc(XtY, x,y);
            sum_squared_Y += dot(y,y);
        }
        // *************
    }

    // add weight_decay on the diagonal of XX' (except for the bias)
    for (int i=1; i<XtX.length(); i++)
        XtX(i,i) += weight_decay;

    if (cholesky) {
        // now solve by Cholesky decomposition
        solveLinearSystemByCholesky(XtX,XtY,theta_t);
    } else {
        theta_t = solveLinearSystem(XtX, XtY);
    }

    real squared_loss=0;
    if (return_squared_loss)
    {
        // squared loss = sum_{ij} theta_{ij} (X'W X theta')_{ij} + sum_{t,i} Y_{ti}^2 - 2 sum_{ij} theta_{ij} (X'W Y)_{ij}
        Mat M(inputsize+1,targetsize);
        product(M,XtX,theta_t);
        squared_loss += dot(M,theta_t);
        squared_loss += sum_squared_Y;
        squared_loss -= 2*dot(XtY,theta_t);
    }
    return squared_loss;
}

Mat linearRegression(VMat inputs, VMat outputs, real weight_decay)
{
    int n = inputs.width()+1;
    int n_outputs = outputs.width();
    Mat XtX(n,n);
    Mat XtY(n,n_outputs);
    Mat theta_t(n,n_outputs);
    real sy=0;
    linearRegression(inputs, outputs, weight_decay, theta_t, false, XtX, XtY,sy);
    return theta_t;
}


real weightedLinearRegression(VMat inputs, VMat outputs, VMat gammas, real weight_decay, Mat theta_t,
                              bool use_precomputed_XtX_XtY, Mat XtX, Mat XtY, real& sum_squared_Y,
                              real& sum_gammas, bool return_squared_loss, int verbose_every,
                              bool cholesky)
{
    int inputsize = inputs.width();
    int targetsize = outputs.width();
    if (outputs.length()!=inputs.length())
        PLERROR("weightedLinearRegression: inputs.length()=%d while outputs.length()=%d",inputs.length(),outputs.length());
    if (theta_t.length()!=inputsize+1 || theta_t.width()!=targetsize)
        PLERROR("weightedLinearRegression: theta_t(%d,%d) should be (%dx%d)",
                theta_t.length(),theta_t.width(),inputsize+1,targetsize);

    int l=inputs.length();
    if(!use_precomputed_XtX_XtY) // then compute them
    {
        XtX.clear();
        XtY.clear();
        VMat X = new ExtendedVMatrix(inputs,0,0,1,0,1.0); // prepend a first column of ones
        VMat Y = outputs;

        // Prepare to compute weighted XtX and XtY
        Vec x(X.width());
        Vec y(Y.width());
        real gamma_i;
        for(int i=0; i<l; i++)
        {
            X->getRow(i,x);
            Y->getRow(i,y);
            gamma_i = gammas(i,0);
            externalProductScaleAcc(XtX, x,x,gamma_i);
            externalProductScaleAcc(XtY, x,y,gamma_i);
            sum_squared_Y += gamma_i * dot(y,y);
            sum_gammas += gamma_i;
        }
    }

    // add weight_decay on the diagonal of XX' (except for the bias)
    for (int i=1; i<XtX.length(); i++)
        XtX(i,i) += weight_decay;

    if (cholesky) {
        // now solve by Cholesky decomposition
        solveLinearSystemByCholesky(XtX,XtY,theta_t);
    } else {
        theta_t = solveLinearSystem(XtX, XtY);
    }

    real squared_loss=0;
    if (return_squared_loss)
    {
        // squared loss = sum_{ij} theta_{ij} (X'W X theta')_{ij} + sum_{t,i} gamma_t*Y_{ti}^2 - 2 sum_{ij} theta_{ij} (X'W Y)_{ij}
        Mat M(inputsize+1,targetsize);
        product(M,XtX,theta_t);
        squared_loss += dot(M,theta_t);
        squared_loss += sum_squared_Y;
        squared_loss -= 2*dot(XtY,theta_t);
    }
    return squared_loss/l;
}

Mat weightedLinearRegression(VMat inputs, VMat outputs, VMat gammas, real weight_decay)
{
    int n = inputs.width()+1;
    int n_outputs = outputs.width();
    Mat XtX(n,n);
    Mat XtY(n,n_outputs);
    Mat theta_t(n,n_outputs);
    real sy=0;
    real sg=0;
    weightedLinearRegression(inputs, outputs, gammas, weight_decay, theta_t, false, XtX, XtY,sy,sg);
    return theta_t;
}
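/* Note (added commentary): with X extended by a leading column of ones, the
   functions above solve the ridge-regularized normal equations
       (X'X + weight_decay * I~) theta = X'Y
   where I~ is the identity with a zero in the bias position, so the bias is
   not shrunk. Minimal usage sketch with hypothetical data:

       Mat weights = linearRegression(train_inputs, train_targets, 1e-3);
       // weights is (inputsize+1) x targetsize; row 0 holds the biases
*/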
VMat rebalanceNClasses(VMat inputs, int nclasses, const string& filename)
{
    if (!file_exists(filename))
    {
        IntVecFile indices(filename, true);
        Vec last = inputs.lastColumn()->toMat().toVecCopy();
        const int len = last.length();
        Vec capacity(nclasses);
        Array<Vec> index(nclasses);
        Array<Vec> index_used(nclasses);
        for (int i=0; i<nclasses; i++) index[i].resize(len);
        for (int i=0; i<nclasses; i++) index_used[i].resize(len);
        real** p_index;
        p_index = new real*[nclasses];
        for (int i=0; i<nclasses; i++) p_index[i] = index[i].data();
        for (int i=0; i<nclasses; i++) index_used[i].clear();
        for (int i=0; i<len; i++)
        {
            int class_i = int(last[i]);
            *p_index[class_i]++ = i;
            capacity[class_i]++;
        }
        for (int i=0; i<nclasses; i++) index[i].resize(int(capacity[i]));
        for (int i=0; i<nclasses; i++) index_used[i].resize(int(capacity[i]));

        Mat class_length(nclasses,2);
        for (int i=0; i<nclasses; i++)
        {
            class_length(i,0) = capacity[i];
            class_length(i,1) = i;
        }
        sortRows(class_length);
        Vec remap = class_length.column(1).toVecCopy();

        vector<int> n(nclasses,0);
        int new_index = -1;
        for (int i=0; i<len; i++)
        {
            int c = i%nclasses;
            int c_map = int(remap[c]);
            if (c == 0)
            {
                if (n[0] == capacity[c_map]) n[0] = 0;
                new_index = int(index[c_map][n[0]++]);
            }
            else
            {
                if (n[c] == capacity[c_map])
                {
                    n[c] = 0;
                    index_used[c_map].clear();
                }
                bool index_found = false;
                int start_pos = binary_search(index[c_map], real(new_index));
                for (int j=start_pos+1; j<capacity[c_map]; j++)
                {
                    if (index_used[c_map][j] == 0)
                    {
                        index_used[c_map][j] = 1;
                        new_index = int(index[c_map][j]);
                        index_found = true;
                        n[c]++;
                        break;
                    }
                }
                if (!index_found)
                {
                    for (int j=0; j<start_pos; j++)
                    {
                        if (index_used[c_map][j] == 0)
                        {
                            index_used[c_map][j] = 1;
                            new_index = int(index[c_map][j]);
                            index_found = true;
                            n[c]++;
                            break;
                        }
                    }
                }
                if (!index_found)
                    PLERROR("In rebalanceNClasses: something went wrong!");
            }
            indices.put(i, new_index);
        }

        delete[] p_index;
    }
    return inputs.rows(filename);
}

void fullyRebalance2Classes(VMat inputs, const string& filename, bool save_indices)
{
    if (!file_exists(filename))
    {
        int len = inputs.length();

        int n_zeros = 0;
        int n_ones = 0;
        Vec zeros(len);
        Vec ones(len);

        Vec last = inputs.lastColumn()->toMat().toVecCopy();
        for (int i=0; i<len;i++)
        {
            if (last[i] == 0)
                zeros[n_zeros++] = i;
            else
                ones[n_ones++] = i;
        }
        zeros.resize(n_zeros);
        ones.resize(n_ones);

        TmpFilenames tmpfile(1);
        string fname = save_indices ? filename : tmpfile.addFilename();
        IntVecFile indices(fname, true);
        int max_symbols = MAX(n_zeros, n_ones);
        for (int i=0; i<max_symbols; i++)
        {
            indices.put(2*i, int(zeros[i%n_zeros]));
            indices.put(2*i+1, int(ones[i%n_ones]));
        }
        if (!save_indices)
        {
            VMat vm = inputs.rows(fname);
            vm.save(filename);
        }
    }
}
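/* Usage sketch (added for illustration): producing a class-balanced view of
   a dataset whose last column holds class labels 0..nclasses-1. The index
   permutation is written to `filename` on the first call and simply reloaded
   on subsequent calls. Names are hypothetical.

       VMat balanced = rebalanceNClasses(train_data, 3, "balanced.idx");
*/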
VMat temporalThreshold(VMat distr, int threshold_date, bool is_before,
                       int yyyymmdd_col)
{
    Vec indices(distr->length());
    int n_data = 0;
    for (int i=0; i<distr->length(); i++)
    {
        int reference_date = (int)distr(i, yyyymmdd_col);
        if (is_before ? reference_date<=threshold_date : reference_date>=threshold_date)
            indices[n_data++] = i;
    }
    indices.resize(n_data);

    return distr.rows(indices);
}

VMat temporalThreshold(VMat distr, int threshold_date, bool is_before,
                       int yyyy_col, int mm_col, int dd_col)
{
    Vec indices(distr->length());
    int n_data = 0;
    for (int i=0; i<distr->length(); i++)
    {
        int reference_date = 10000*(int)distr(i, yyyy_col) + 100*(int)distr(i, mm_col) + (int)distr(i, dd_col);
        if (is_before ? reference_date<=threshold_date : reference_date>=threshold_date)
            indices[n_data++] = i;
    }
    indices.resize(n_data);

    return distr.rows(indices);
}

void correlations(const VMat& x, const VMat& y, Mat& r, Mat& pvalues)
{
    int n=x.length();
    if (n!=y.length())
        PLERROR("correlations: x and y must have the same length");
    int wx=x.width();
    int wy=y.width();
    r.resize(wx,wy);
    r.clear();
    Mat sxy(wx,wy);
    Vec sx2(wx);
    Vec sy2(wy);
    Vec sx(wx);
    Vec sy(wy);
    Vec xt(wx);
    Vec yt(wy);
    for (int t=0;t<n;t++)
    {
        x->getRow(t,xt);
        y->getRow(t,yt);
        // accumulate the x sums once per row (not once per y column)
        for (int i=0;i<wx;i++)
        {
            real xti = xt[i];
            sx[i] += xti;
            sx2[i] += xti*xti;
        }
        for (int j=0;j<wy;j++)
        {
            real ytj = yt[j];
            sy[j] += ytj;
            sy2[j] += ytj*ytj;
            for (int i=0;i<wx;i++)
                sxy(i,j) += xt[i]*ytj;
        }
    }
    for (int i=0;i<wx;i++)
        for (int j=0;j<wy;j++)
        {
            real nv = (sx2[i] - sx[i]/n*sx[i]); // = n * variance of x
            if (nv>0) // don't bother if variance is 0
                r(i,j) = (n*sxy(i,j)-sx[i]*sy[j])/sqrt((n*sx2[i]-sx[i]*sx[i])*(n*sy2[j]-sy[j]*sy[j]));
            else
                r(i,j) = 0;
            if (r(i,j)<-1.01 || r(i,j)>1.01)
                PLWARNING("correlation: weird correlation coefficient, %f for %d-th input, %d-target",
                          r(i,j),i,j);
        }
    pvalues.resize(r.length(),r.width());
    for (int i=0;i<r.length();i++)
        for (int j=0;j<r.width();j++)
            pvalues(i,j) = testNoCorrelationAsymptotically(r(i,j),n);
}
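/* Note (added commentary): correlations() computes the usual Pearson
   coefficient from running sums,
       r = (n*sxy - sx*sy) / sqrt((n*sx2 - sx^2) * (n*sy2 - sy^2)),
   then converts each r into a p-value for the null hypothesis of zero
   correlation via testNoCorrelationAsymptotically (from stats_utils.h,
   included above).
*/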
// This is an efficient version of the most basic nearest neighbor search, using a Mat and euclidean distance
void computeNearestNeighbors(VMat dataset, Vec x, TVec<int>& neighbors, int ignore_row)
{
    int K = neighbors.length(); // how many neighbors do we want?
    BottomNI<real> neighbs(K);
    Vec row(dataset->width());
    for(int i=0; i<dataset->length(); i++)
        if(i!=ignore_row)
        {
            dataset->getRow(i,row);
            neighbs.update(powdistance(row,x), i);
        }
    neighbs.sort();
    TVec< pair<real,int> > indices = neighbs.getBottomN();
    int nonzero=0;
    for(int k=0; k<K; k++)
    {
        if(indices[k].first>0)
            nonzero++;
        neighbors[k] = indices[k].second;
    }
    if(nonzero==0)
        PLERROR("All neighbors had 0 distance. Use more neighbors. (There were %i other patterns with same values)",neighbs.nZeros());
}


} // end of namespace PLearn
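/* Usage sketch (added for illustration): finding the 5 rows of a dataset
   closest to a query point, excluding row 42 (e.g., the query's own row).
   Names are hypothetical.

       TVec<int> neighbors(5);  // K = 5, inferred from the vector's length
       computeNearestNeighbors(dataset, query, neighbors, 42);
       // neighbors now holds row indices, closest first
*/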
