00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
#include "VMat_maths.h"
00043
#include <plearn/math/TMat_maths.h>
00044
#include <plearn/io/IntVecFile.h>
00045
#include "MemoryVMatrix.h"
00046
#include "ShiftAndRescaleVMatrix.h"
00047
#include "ExtendedVMatrix.h"
00048
#include <plearn/base/Array.h>
00049
#include <plearn/math/plapack.h>
00050
#include <plearn/math/random.h>
00051
#include <plearn/io/TmpFilenames.h>
00052
#include <plearn/io/fileutils.h>
00053
#include <plearn/sys/PLMPI.h>
00054
#include <vector>
00055
#include <plearn/math/VecStatsCollector.h>
00056
#include <plearn/math/ConditionalStatsCollector.h>
00057
#include <plearn/math/stats_utils.h>
00058
#include <plearn/math/BottomNI.h>
00059
00060
namespace PLearn {
00061
using namespace std;
00062
00064 void computeWeightedMean(
Vec weights,
VMat d,
Vec& meanvec)
00065 {
00066
int w = d->
width();
00067
int l = d->
length();
00068
if(weights.
length() != l) {
00069
PLERROR(
"In VMat.cc, method computeMean: weights.length() != d->length()\n");
00070 }
00071 meanvec.
resize(w);
00072 meanvec.
clear();
00073
Vec samplevec(w);
00074
for(
int i=0; i<l; i++)
00075 {
00076 d->getRow(i,samplevec);
00077 meanvec += weights[i]*samplevec;
00078 }
00079 meanvec /=
sum(weights);
00080 }
00081
00082
00083 void computeRange(
VMat d,
Vec& minvec,
Vec& maxvec)
00084 {
00085
int l = d.
length();
00086
int w = d.
width();
00087 minvec.
resize(w);
00088 maxvec.
resize(w);
00089 minvec.
fill(FLT_MAX);
00090 maxvec.
fill(-FLT_MAX);
00091
00092
Vec v(w);
00093
for(
int i=0; i<l; i++)
00094 {
00095 d->getRow(i,v);
00096
for(
int j=0; j<w; j++)
00097 {
00098 minvec[j] =
min(v[j],minvec[j]);
00099 maxvec[j] =
max(v[j],maxvec[j]);
00100 }
00101 }
00102 }
00103
00104 void computeRowMean(
VMat d,
Vec& meanvec)
00105 {
00106 meanvec.
resize(d->
length());
00107
Vec samplevec(d->
width());
00108
for(
int i=0; i<d->
length(); i++)
00109 {
00110 d->getRow(i,samplevec);
00111 meanvec[i] =
mean(samplevec);
00112 }
00113 }
00114
00115 void computeMean(
VMat d,
Vec& meanvec)
00116 {
00117 meanvec.
resize(d->
width());
00118 meanvec.
clear();
00119
Vec samplevec(d->
width());
00120
for(
int i=0; i<d->
length(); i++)
00121 {
00122 d->getRow(i,samplevec);
00123 meanvec += samplevec;
00124 }
00125 meanvec /=
real(d->
length());
00126 }
00127
00128 Mat computeBasicStats(
VMat m)
00129 {
00130
Vec v(m.
width());
00131
real* vdata = v.
data();
00132
Mat stats(10,m.
width());
00133
Vec mean_row = stats(
MEAN_ROW);
00134
Vec stddev_row = stats(
STDDEV_ROW);
00135
Vec min_row = stats(
MIN_ROW);
00136
Vec max_row = stats(
MAX_ROW);
00137
Vec nmissing_row = stats(
NMISSING_ROW);
00138
Vec nzero_row = stats(
NZERO_ROW);
00139
Vec npositive_row = stats(
NPOSITIVE_ROW);
00140
Vec nnegative_row = stats(
NNEGATIVE_ROW);
00141
Vec meanpos_row = stats(
MEANPOS_ROW);
00142
Vec stddevpos_row = stats(
STDDEVPOS_ROW);
00143 min_row.
fill(FLT_MAX);
00144 max_row.
fill(-FLT_MAX);
00145
00146
for(
int i=0; i<m.
length(); i++)
00147 {
00148 m->getRow(i,v);
00149
for(
int j=0; j<v.
length(); j++)
00150 {
00151
real val = vdata[j];
00152
if(
is_missing(
val))
00153 nmissing_row[j]++;
00154
else
00155 {
00156
if(
val<min_row[j])
00157 min_row[j] =
val;
00158
if(
val>max_row[j])
00159 max_row[j] =
val;
00160
00161
if(
val==0.)
00162 nzero_row[j]++;
00163
else if(
val>0.)
00164 {
00165 npositive_row[j]++;
00166 mean_row[j] +=
val;
00167 stddev_row[j] +=
val*
val;
00168 meanpos_row[j] +=
val;
00169 stddevpos_row[j] +=
val*
val;
00170 }
00171
else
00172 {
00173 nnegative_row[j]++;
00174 mean_row[j] +=
val;
00175 stddev_row[j] +=
val*
val;
00176 }
00177 }
00178 }
00179 }
00180
for(
int j=0; j<stats.width(); j++)
00181 {
00182
real nnonmissing = nzero_row[j]+nnegative_row[j]+npositive_row[j];
00183 mean_row[j] /= nnonmissing;
00184 meanpos_row[j] /= npositive_row[j];
00185 stddev_row[j] =
sqrt(stddev_row[j]/nnonmissing -
square(mean_row[j]));
00186 stddevpos_row[j] =
sqrt(stddevpos_row[j]/npositive_row[j] -
square(meanpos_row[j]));
00187 }
00188
return stats;
00189 }
00190
00191 void computeStats(
VMat m,
VecStatsCollector& st,
bool report_progress)
00192 {
00193 st.
forget();
00194 st.
setFieldNames(m->fieldNames());
00195
Vec v(m.
width());
00196
int l = m.
length();
00197
ProgressBar* pbar = 0;
00198
if (report_progress)
00199 pbar =
new ProgressBar(
"Computing statistics", l);
00200
for(
int i=0; i<l; i++)
00201 {
00202 m->getRow(i,v);
00203 st.
update(v);
00204
if (report_progress)
00205 pbar->
update(i);
00206 }
00207
if (pbar)
00208
delete pbar;
00209 st.
finalize();
00210 }
00211
00212 TVec<StatsCollector> computeStats(
VMat m,
int maxnvalues,
bool report_progress)
00213 {
00214
int w = m.
width();
00215
int l = m.
length();
00216
TVec<StatsCollector> stats(w,
StatsCollector(maxnvalues));
00217
Vec v(w);
00218
ProgressBar* pbar = 0;
00219
if (report_progress)
00220 pbar =
new ProgressBar(
"Computing statistics", l);
00221
for(
int i=0; i<l; i++)
00222 {
00223 m->getRow(i,v);
00224
for(
int j=0; j<w; j++)
00225 stats[j].update(v[j]);
00226
if (report_progress)
00227 pbar->
update(i);
00228 }
00229
if (pbar)
00230
delete pbar;
00231
return stats;
00232 }
00233
00235 PP<ConditionalStatsCollector> computeConditionalStats(
VMat m,
int condfield,
TVec<RealMapping> ranges)
00236 {
00237
PP<ConditionalStatsCollector> condst =
new ConditionalStatsCollector;
00238 condst->setBinMappingsAndCondvar(ranges, condfield);
00239
int l = m.
length();
00240
int w = m.
width();
00241
Vec v(w);
00242
for(
int i=0; i<l; i++)
00243 {
00244
if(i%10000==0)
00245 cerr <<
"computeConditionalStats: row " << i <<
" of " << l <<
endl;
00246 m->getRow(i,v);
00247 condst->update(v);
00248 }
00249
return condst;
00250 }
00251
00252
00253
00254
00255
00256
00257
00258 Array<Mat> computeConditionalMeans(
VMat trainset,
int targetsize,
Mat& basic_stats)
00259 {
00260
if(!basic_stats)
00261 basic_stats =
computeBasicStats(trainset);
00262
00263
int inputsize = trainset.
width()-targetsize;
00264
Array<Mat> a(inputsize);
00265
for(
int j=0; j<inputsize; j++)
00266 {
00267
real minval = basic_stats(
MIN_ROW,j);
00268
real maxval = basic_stats(
MAX_ROW,j);
00269
if(
is_integer(minval) &&
is_integer(maxval) && maxval-minval<400)
00270 {
00271 a[j] =
Mat(
int(maxval-minval+1),2+targetsize*4);
00272
for(
int k=0;
k<a[j].
length();
k++)
00273 a[j](
k,0) = minval+
k;
00274 }
00275 }
00276
00277
Vec row(trainset.
width());
00278
Vec input = row.
subVec(0,inputsize);
00279
Vec target = row.
subVec(inputsize,targetsize);
00280
for(
int i=0; i<trainset.
length(); i++)
00281 {
00282 trainset->getRow(i,row);
00283
for(
int j=0; j<inputsize; j++)
00284 {
00285
Mat& m = a[j];
00286
if(m.
isNotEmpty())
00287 {
00288
int k =
int(input[j]-basic_stats(
MIN_ROW,j));
00289
Vec m_k = m(
k);
00290 m_k[1]++;
00291
for(
int l=0; l<targetsize; l++)
00292 {
00293
real targetval = target[l];
00294 m_k[2+4*l] += targetval;
00295 m_k[3+4*l] +=
square(targetval);
00296 }
00297 }
00298 }
00299 }
00300
00301
00302
for(
int j=0; j<inputsize; j++)
00303 {
00304
Mat& m = a[j];
00305
if(m.
isNotEmpty())
00306 {
00307
for(
int k=0;
k<m.
length();
k++)
00308 {
00309
Vec m_k = m(
k);
00310
real n = m_k[1];
00311
if(n>0.)
00312 {
00313
00314
for(
int l=0; l<targetsize; l++)
00315 {
00316
real sum = m_k[2+4*l];
00317
real sumsquare = m_k[3+4*l];
00318
real mean =
sum/n;
00319
real variance = (
sumsquare-
square(
sum)/n)/(n-1);
00320
real mse =
sumsquare/n -
square(
sum/n);
00321
real stddev_of_mean =
sqrt(
variance/n);
00322
real stddev_of_mse =
variance*
sqrt(2./n);
00323 m_k[2+4*l] =
mean;
00324 m_k[3+4*l] = stddev_of_mean;
00325 m_k[4+4*l] = mse;
00326 m_k[5+4*l] = stddev_of_mse;
00327 }
00328 }
00329 }
00330 }
00331 }
00332
00333
return a;
00334 }
00335
00336 void computeMeanAndVariance(
VMat d,
Vec& meanvec,
Vec& variancevec)
00337 {
00338
computeMean(d, meanvec);
00339 variancevec.
resize(d->
width());
00340 variancevec.
clear();
00341
int w = d->
width();
00342
int l = d->
length();
00343
Vec samplevec(w);
00344
Vec diffvec(w);
00345
Vec sqdiffvec(w);
00346
for(
int i=0; i<l; i++)
00347 {
00348 d->getRow(i,samplevec);
00349
00350
substract(samplevec,meanvec,diffvec);
00351
multiply(diffvec,diffvec,sqdiffvec);
00352 variancevec += sqdiffvec;
00353 }
00354 variancevec /=
real(l-1);
00355
00356 }
00357
00358 void computeInputMean(
VMat d,
Vec& meanvec)
00359 {
00360
Vec input;
00361
Vec target;
00362
real weight;
00363
int l = d->
length();
00364
int n = d->inputsize();
00365
real weightsum = 0;
00366 meanvec.
resize(n);
00367 meanvec.
clear();
00368
for(
int i=0; i<l; i++)
00369 {
00370 d->
getExample(i,input,target,weight);
00371 weightsum += weight;
00372
multiplyAcc(meanvec,input,weight);
00373 }
00374 meanvec /= weightsum;
00375 }
00376
00377 void computeInputMeanAndCovar(
VMat d,
Vec& meanvec,
Mat& covarmat)
00378 {
00379
Vec input;
00380
Vec target;
00381
Vec offset;
00382
real weight;
00383
int l = d->
length();
00384
int n = d->inputsize();
00385
real weightsum = 0;
00386 meanvec.
resize(n);
00387 meanvec.
clear();
00388 covarmat.
resize(n,n);
00389 covarmat.
clear();
00390 offset.
resize(n);
00391
for(
int i=0; i<l; i++)
00392 {
00393 d->
getExample(i,input,target,weight);
00394 weightsum += weight;
00395
multiplyAcc(meanvec,input,weight);
00396
if (i==0) offset << input;
00397 input-=offset;
00398
externalProductScaleAcc(covarmat, input, input, weight);
00399 }
00400 meanvec *= 1/weightsum;
00401 covarmat *= 1/weightsum;
00402 offset-=meanvec;
00403
externalProductScaleAcc(covarmat, offset, offset,
real(-1));
00404 }
00405
00406 void computeInputMeanAndVariance(
VMat d,
Vec& meanvec,
Vec& var)
00407 {
00408
Vec input;
00409
Vec target;
00410
Vec offset;
00411
real weight;
00412
int l = d->
length();
00413
int n = d->inputsize();
00414
real weightsum = 0;
00415 meanvec.
resize(n);
00416 meanvec.
clear();
00417
var.resize(n);
00418
var.clear();
00419 offset.
resize(n);
00420
for(
int i=0; i<l; i++)
00421 {
00422 d->
getExample(i,input,target,weight);
00423
if (i==0) offset<<input;
00424 weightsum+=weight;
00425
for(
int j=0;j<input.size();j++)
00426 {
00427
real xj = input[j]-offset[j];
00428
var[j]+=weight*xj*xj;
00429 }
00430
multiplyAcc(meanvec,input,weight);
00431 }
00432 meanvec /= weightsum;
00433
var /= weightsum;
00434
for(
int i=0;i<input.
size();i++)
00435 {
00436
real mu=meanvec[i]-offset[i];
00437
var[i]-=mu*mu;
00438
if (
var[i] < 0) {
00439
00440
var[i] = 0;
00441 }
00442 }
00443 }
00444
00445
00446 void computeWeightedMeanAndCovar(
Vec weights,
VMat d,
Vec& meanvec,
Mat& covarmat)
00447 {
00448
int w = d->
width();
00449
int l = d->
length();
00450
computeWeightedMean(weights, d, meanvec);
00451 covarmat.
resize(w,w);
00452 covarmat.
clear();
00453
Vec samplevec(w);
00454
Vec diffvec(w);
00455
real weight_sum = 0;
00456
for(
int i=0; i<l; i++)
00457 {
00458 d->getRow(i,samplevec);
00459
substract(samplevec,meanvec,diffvec);
00460
real weight = weights[i];
00461
externalProductScaleAcc(covarmat, diffvec,diffvec, weight);
00462 weight_sum += weight;
00463 }
00464 covarmat *=
real(1./weight_sum);
00465 }
00466
00470 real computeWeightedMeanAndCovar(
VMat d,
Vec& meanvec,
Mat& covarmat,
real threshold)
00471 {
00472
Vec samplevec;
00473
Vec diffvec;
00474
int w = d->
width()-1;
00475
int l = d->
length();
00476 samplevec.
resize(w+1);
00477 diffvec.
resize(w);
00478
Vec input = samplevec.
subVec(0,w);
00479
real& weight = samplevec[w];
00480
00481
real weightsum = 0;
00482
00483
00484 meanvec.
resize(w);
00485 meanvec.
clear();
00486
for(
int i=0; i<l; i++)
00487 {
00488 d->getRow(i,samplevec);
00489
if(weight>threshold)
00490 {
00491
multiplyAcc(meanvec, input, weight);
00492 weightsum += weight;
00493 }
00494 }
00495
00496 meanvec *=
real(1./weightsum);
00497
00498
00499 covarmat.
resize(w,w);
00500 covarmat.
clear();
00501
for(
int i=0; i<l; i++)
00502 {
00503 d->getRow(i,samplevec);
00504
if(weight>threshold)
00505 {
00506
substract(input,meanvec,diffvec);
00507
externalProductScaleAcc(covarmat, diffvec,diffvec, weight);
00508 }
00509 }
00510
00511 covarmat *= real(1./weightsum);
00512
00513
return weightsum;
00514 }
00515
00516
00518 void computeMeanAndCovar(
VMat m,
Vec& meanvec,
Mat& covarmat, ostream& logstream)
00519 {
00520
int w = m->
width();
00521
int l = m->
length();
00522 meanvec.
resize(w);
00523 covarmat.
resize(w,w);
00524
00525
MemoryVMatrix* memvm = dynamic_cast<MemoryVMatrix*>((
VMatrix*)m);
00526
if(memvm)
00527
computeMeanAndCovar(m->
toMat(), meanvec, covarmat);
00528
else
00529 {
00530 meanvec.
clear();
00531 covarmat.
clear();
00532
Vec v(w);
00533
00534
ProgressBar progbar(logstream,
"Computing covariance",l);
00535
00536
if(USING_MPI && PLMPI::synchronized && PLMPI::size>1)
00537 {
00538
#if USING_MPI
00539
PLMPI::synchronized =
false;
00540
00541
if(!covarmat.
isCompact())
00542
PLERROR(
"In computeMeanAndCovar: MPI implementation cannot handle non-compact covariance matrices, please pass a compact matrix");
00543
00544
00545
Vec meanvec_b(meanvec.
length());
00546
Mat covarmat_b(covarmat.
length(),covarmat.
width());
00547
00548
for(
int i=PLMPI::rank; i<l; i+=PLMPI::size)
00549 {
00550 m->getRow(i,v);
00551 meanvec_b += v;
00552
externalProductAcc(covarmat_b, v, v);
00553 progbar(i);
00554 }
00555
00556 MPI_Reduce(meanvec_b.
data(), meanvec.
data(), meanvec.
length(), PLMPI_REAL, MPI_SUM, 0, MPI_COMM_WORLD);
00557 MPI_Bcast(meanvec.
data(), meanvec.
length(), PLMPI_REAL, 0, MPI_COMM_WORLD);
00558 MPI_Reduce(covarmat_b.
data(), covarmat.
data(), covarmat.
size(), PLMPI_REAL, MPI_SUM, 0, MPI_COMM_WORLD);
00559 MPI_Bcast(covarmat.
data(), covarmat.
size(), PLMPI_REAL, 0, MPI_COMM_WORLD);
00560
00561 PLMPI::synchronized =
true;
00562
#endif
00563
}
00564
else
00565 {
00566
for(
int i=0; i<l; i++)
00567 {
00568 m->getRow(i,v);
00569 meanvec += v;
00570
externalProductAcc(covarmat, v, v);
00571 progbar(i);
00572 }
00573 }
00574
00575
00576 meanvec /=
real(l);
00577 covarmat /=
real(l);
00578
externalProductScaleAcc(covarmat,meanvec,meanvec,
real(-1.));
00579 }
00580 }
00581
00582
00583
00584
00585
00586
00587
00588
00589
00590
00591
00592
00593
00594
00595
00596
00597
00598
00599
00600
00601
00602
00603
00604 void computeMeanAndStddev(
VMat d,
Vec& meanvec,
Vec& stddevvec)
00605 {
00606
computeMeanAndVariance(d,meanvec,stddevvec);
00607
for(
int i=0; i<stddevvec.
length(); i++)
00608 stddevvec[i] =
sqrt(stddevvec[i]);
00609 }
00610
00611 void autocorrelation_function(
const VMat& data,
Mat& acf)
00612 {
00613
int T = data.
length();
00614
int N = data.
width();
00615 acf.
resize(T-2, N);
00616
00617
for(
int delta=0; delta < T-2; delta++)
00618 {
00619
Vec sumT(N);
00620
Vec sumD(N);
00621
TVec<Vec> products(N);
00622
00623
00624
for(
int k=0;
k < N;
k++)
00625 {
00626
real ts = data(delta,
k);
00627
real ds = data(0,
k);
00628
00629 sumT[
k] = ts;
00630 sumD[
k] = ds;
00631
00632 products[
k].
resize(3);
00633 products[
k][0] = ts*ts;
00634 products[
k][1] = ds*ds;
00635 products[
k][2] = ts*ds;
00636 }
00637
00638
for(
int t=delta+1; t < T; t++)
00639 {
00640
for(
int k=0;
k < N;
k++)
00641 {
00642
real ts = data(t,
k);
00643
real ds = data(t-delta,
k);
00644
00645 sumT[
k] += ts;
00646 sumD[
k] += ds;
00647
00648 products[
k][0] += ts*ts;
00649 products[
k][1] += ds*ds;
00650 products[
k][2] += ts*ds;
00651 }
00652 }
00653
00654
00655
for(
int k=0;
k < N;
k++)
00656 {
00657
int count = T-delta;
00658
real multiplied_var_t = products[
k][0] -
square(sumT[
k])/
count;
00659
real multiplied_var_d = products[
k][1] -
square(sumD[
k])/
count;
00660 acf(delta,
k) = (products[
k][2] - sumT[
k]*sumD[
k]/
count) /
sqrt(multiplied_var_t * multiplied_var_d);
00661 }
00662 }
00663 }
00664
00665
00666 VMat normalize(
VMat d,
Vec meanvec,
Vec stddevvec)
00667 {
00668
int inputsize = meanvec.
length();
00669
00670
Vec shiftvec(d.
width(),0.0);
00671 shiftvec.
subVec(0,inputsize) << meanvec;
00672
negateElements(shiftvec);
00673
00674
Vec scalevec(d.
width(),1.0);
00675 scalevec.
subVec(0,inputsize) << stddevvec;
00676
invertElements(scalevec);
00677
00678
return new ShiftAndRescaleVMatrix(d, shiftvec, scalevec);
00679 }
00680
00681 VMat normalize(
VMat d,
int inputsize,
int ntrain)
00682 {
00683
Vec meanvec(inputsize);
00684
Vec stddevvec(inputsize);
00685
computeMeanAndStddev(d.
subMat(0,0,ntrain,inputsize), meanvec, stddevvec);
00686
return normalize(d, meanvec, stddevvec);
00687 }
00688
00689 VMat grep(
VMat d,
int col,
Vec values,
bool exclude)
00690 {
00691
Vec indices(d.
length());
00692
int nrows = 0;
00693
for(
int i=0; i<d.
length(); i++)
00694 {
00695
bool contains = values.
contains(d(i,col));
00696
if( (!exclude && contains) || (exclude && !contains) )
00697 indices[nrows++] = i;
00698 }
00699 indices = indices.
subVec(0,nrows);
00700
return d.
rows(indices.
copy());
00701 }
00702
00704 map<real, int>
countOccurencesInColumn(
VMat m,
int col)
00705 {
00706 map<real, int> counts;
00707 map<real, int>::iterator found;
00708
int l = m.
length();
00709
for(
int i=0; i<l; i++)
00710 {
00711
real val = m(i,col);
00712
if (
is_missing(
val))
00713
00714
00715
PLERROR(
"In countOccurencesInColumn - Found a missing value, this case is currently not handled");
00716 found = counts.find(
val);
00717
if(found==counts.end())
00718 counts[
val] = 1;
00719
else
00720 found->second++;
00721 }
00722
return counts;
00723 }
00724
00726 map<real, TVec<int> >
indicesOfOccurencesInColumn(
VMat m,
int col)
00727 {
00728 map< real, TVec<int> > indices;
00729 map<real, int> counts =
countOccurencesInColumn(m,col);
00730 map<real, int>::iterator it = counts.begin();
00731 map<real, int>::iterator itend = counts.end();
00732
for(; it!=itend; ++it)
00733 {
00734 indices[it->first].resize(it->second);
00735 indices[it->first].resize(0);
00736 }
00737
int l = m.
length();
00738
for(
int i=0; i<l; i++)
00739 indices[m(i,col)].push_back(i);
00740
return indices;
00741 }
00742
00743 VMat grep(
VMat d,
int col,
Vec values,
const string& indexfile,
bool exclude)
00744 {
00745
if(!
file_exists(indexfile))
00746 {
00747
IntVecFile indices(indexfile,
true);
00748
for(
int i=0; i<d.
length(); i++)
00749 {
00750
bool contains = values.
contains(d(i,col));
00751
if( (!exclude && contains) || (exclude && !contains) )
00752 indices.
append(i);
00753 }
00754 }
00755
return d.
rows(indexfile);
00756 }
00757
00758 VMat filter(
VMat d,
const string& indexfile)
00759 {
00760
if(!
file_exists(indexfile) ||
file_size(indexfile)==0)
00761 {
00762
IntVecFile indices(indexfile,
true);
00763
Vec v(d.
width());
00764
for(
int i=0; i<d.
length(); i++)
00765 {
00766 d->getRow(i,v);
00767
if(!v.
hasMissing())
00768 indices.
append(i);
00769 }
00770 }
00771
return d.
rows(indexfile);
00772 }
00773
00774
00775 VMat shuffle(
VMat d)
00776 {
00777
Vec indices(0, d.
length()-1, 1);
00778
shuffleElements(indices);
00779
return d.
rows(indices);
00780 }
00781
00782 VMat bootstrap(
VMat d,
bool reorder,
bool norepeat)
00783 {
00784
Vec indices;
00785
if (norepeat)
00786 {
00787 indices =
Vec(0, d.
length()-1, 1);
00788
shuffleElements(indices);
00789 indices = indices.
subVec(0,
int(0.667 * d.
length()));
00790
if (reorder)
00791
sortElements(indices);
00792
return d.
rows(indices);
00793 }
00794
else
00795 {
00796 indices.
resize(d.
length());
00797
for (
int i=0;i<d.
length();i++)
00798 indices[i] =
uniform_multinomial_sample(d.
length());
00799 }
00800
if (reorder)
00801
sortElements(indices);
00802
return d.
rows(indices);
00803 }
00804
00805 Mat transposeProduct(
VMat m)
00806 {
00807
Mat result(m.
width(),m.
width());
00808
00809
Vec v(m.
width());
00810
Mat vrowmat =
rowmatrix(v);
00811
00812
for(
int i=0; i<m.
length(); i++)
00813 {
00814 m->getRow(i,v);
00815
transposeProductAcc(result, vrowmat,vrowmat);
00816 }
00817
return result;
00818 }
00819
00820 Mat transposeProduct(
VMat m1,
VMat m2)
00821 {
00822
if(m1.
length()!=m2.
length())
00823
PLERROR(
"in Mat transposeProduct(VMat m1, VMat m2) arguments have incompatible dimensions");
00824
00825
Mat result(m1.
width(),m2.
width());
00826
00827
Vec v1(m1.
width());
00828
Vec v2(m2.
width());
00829
Mat v1rowmat =
rowmatrix(v1);
00830
Mat v2rowmat =
rowmatrix(v2);
00831
00832
for(
int i=0; i<m1.
length(); i++)
00833 {
00834 m1->getRow(i,v1);
00835 m2->getRow(i,v2);
00836
transposeProductAcc(result, v1rowmat,v2rowmat);
00837 }
00838
return result;
00839 }
00840
00841 Vec transposeProduct(
VMat m1,
Vec v2)
00842 {
00843
if(m1.
length()!=v2.
length())
00844
PLERROR(
"in Mat transposeProduct(VMat m1, Vec v2) arguments have incompatible dimensions");
00845
00846
Vec result(m1.
width(),1);
00847 result.
clear();
00848
00849
Vec v1(m1.
width());
00850
for(
int i=0; i<m1.
length(); i++)
00851 {
00852 m1->getRow(i,v1);
00853 result += v1 * v2[i];
00854 }
00855
return result;
00856 }
00857
00858 Mat productTranspose(
VMat m1,
VMat m2)
00859 {
00860
if(m1.
width()!=m2.
width())
00861
PLERROR(
"in Mat productTranspose(VMat m1, VMat m2) arguments have incompatible dimensions");
00862
00863
int m1l = (m1.
length());
00864
int m2l = (m2.
length());
00865
int w = (m1.
width());
00866
Mat result(m1l,m2l);
00867
00868
Vec v1(w);
00869
Vec v2(w);
00870
00871
for(
int i=0; i<m1l; i++)
00872 {
00873 m1->getRow(i,v1);
00874
for(
int j=0; j<m2l; j++)
00875 {
00876 m2->getRow(j,v2);
00877 result(i,j) =
dot(v1,v2);
00878 }
00879 }
00880
return result;
00881 }
00882
00883 Mat product(
Mat m1,
VMat m2)
00884 {
00885
if(m1.
width()!=m2.
length())
00886
PLERROR(
"in Mat product(VMat m1, VMat m2) arguments have incompatible dimensions");
00887
00888
Mat result(m1.
length(),m2.
width());
00889 result.
clear();
00890
00891
Vec v2(m2.
width());
00892
Mat v2rowmat =
rowmatrix(v2);
00893
00894
for(
int i=0; i<m1.
width(); i++)
00895 {
00896 m2->getRow(i,v2);
00897
productAcc(result, m1.
column(i), v2rowmat);
00898 }
00899
return result;
00900 }
00901
00902 VMat transpose(
VMat m1)
00903 {
00904
return VMat(
transpose(m1.
toMat()));
00905 }
00906
00907 real linearRegression(
VMat inputs,
VMat outputs,
real weight_decay,
Mat theta_t,
00908
bool use_precomputed_XtX_XtY,
Mat XtX,
Mat XtY,
real& sum_squared_Y,
00909
bool return_squared_loss,
int verbose_every,
bool cholesky)
00910 {
00911
if (outputs.
length()!=inputs.
length())
00912
PLERROR(
"linearRegression: inputs.length()=%d while outputs.length()=%d",inputs.
length(),outputs.
length());
00913
if (theta_t.
length()!=inputs.
width()+1 || theta_t.
width()!=outputs.
width())
00914
PLERROR(
"linearRegression: theta_t(%d,%d) should be (%dx%d)",
00915 theta_t.
length(),theta_t.
width(),inputs.
width()+1,outputs.
width());
00916
00917
int inputsize = inputs.
width();
00918
int targetsize = outputs.
width();
00919
00920
if(XtX.
length()!=inputsize+1 || XtX.
width()!=inputsize+1)
00921
PLERROR(
"In linearRegression: XtX should have dimensions %dx%d (inputs.width()+1)x(inputs.width()+1)",inputsize+1,inputsize+1);
00922
if(XtY.
length()!=inputsize+1 || XtY.
width()!=targetsize)
00923
PLERROR(
"In linearRegression: XtY should have dimensions %dx%d (inputs.width()+1)x(outputs.width())",inputsize+1,targetsize);
00924
00925
if(!use_precomputed_XtX_XtY)
00926 {
00927
VMat X =
new ExtendedVMatrix(inputs,0,0,1,0,1.0);
00928
VMat Y = outputs;
00929
00930
00931
00932
00933 XtX.
clear();
00934 XtY.
clear();
00935 sum_squared_Y=0;
00936
Vec x(X.
width());
00937
Vec y(Y.
width());
00938
int l=X.
length();
00939
for(
int i=0; i<l; i++)
00940 {
00941 X->getRow(i,
x);
00942 Y->getRow(i,y);
00943
externalProductAcc(XtX,
x,
x);
00944
externalProductAcc(XtY,
x,y);
00945 sum_squared_Y +=
dot(y,y);
00946 }
00947
00948 }
00949
00950
00951
for (
int i=1; i<XtX.
length(); i++)
00952 XtX(i,i) += weight_decay;
00953
00954
if (cholesky) {
00955
00956
solveLinearSystemByCholesky(XtX,XtY,theta_t);
00957 }
else {
00958 theta_t =
solveLinearSystem(XtX, XtY);
00959 }
00960
00961
real squared_loss=0;
00962
if (return_squared_loss)
00963 {
00964
00965
Mat M(inputsize+1,targetsize);
00966
product(M,XtX,theta_t);
00967 squared_loss +=
dot(M,theta_t);
00968 squared_loss += sum_squared_Y;
00969 squared_loss -= 2*
dot(XtY,theta_t);
00970 }
00971
return squared_loss;
00972 }
00973
00974 Mat linearRegression(
VMat inputs,
VMat outputs,
real weight_decay)
00975 {
00976
int n = inputs.
width()+1;
00977
int n_outputs = outputs.
width();
00978
Mat XtX(n,n);
00979
Mat XtY(n,n_outputs);
00980
Mat theta_t(n,n_outputs);
00981
real sy=0;
00982
linearRegression(inputs, outputs, weight_decay, theta_t,
false, XtX, XtY,sy);
00983
return theta_t;
00984 }
00985
00986
00987 real weightedLinearRegression(
VMat inputs,
VMat outputs,
VMat gammas,
real weight_decay,
Mat theta_t,
00988
bool use_precomputed_XtX_XtY,
Mat XtX,
Mat XtY,
real& sum_squared_Y,
00989
real& sum_gammas,
bool return_squared_loss,
int verbose_every,
00990
bool cholesky)
00991 {
00992
int inputsize = inputs.
width();
00993
int targetsize = outputs.
width();
00994
if (outputs.
length()!=inputs.
length())
00995
PLERROR(
"linearRegression: inputs.length()=%d while outputs.length()=%d",inputs.
length(),outputs.
length());
00996
if (theta_t.
length()!=inputsize+1 || theta_t.
width()!=targetsize)
00997
PLERROR(
"linearRegression: theta_t(%d,%d) should be (%dx%d)",
00998 theta_t.
length(),theta_t.
width(),inputsize+1,targetsize);
00999
01000
int l=inputs.
length();
01001
if(!use_precomputed_XtX_XtY)
01002 {
01003 XtX.
clear();
01004 XtY.
clear();
01005
VMat X =
new ExtendedVMatrix(inputs,0,0,1,0,1.0);
01006
VMat Y = outputs;
01007
01008
01009
Vec x(X.
width());
01010
Vec y(Y.
width());
01011
real gamma_i;
01012
for(
int i=0; i<l; i++)
01013 {
01014 X->getRow(i,
x);
01015 Y->getRow(i,y);
01016 gamma_i = gammas(i,0);
01017
externalProductScaleAcc(XtX,
x,
x,gamma_i);
01018
externalProductScaleAcc(XtY,
x,y,gamma_i);
01019 sum_squared_Y += gamma_i *
dot(y,y);
01020 sum_gammas += gamma_i;
01021 }
01022 }
01023
01024
01025
for (
int i=1; i<XtX.
length(); i++)
01026 XtX(i,i) += weight_decay;
01027
01028
if (cholesky) {
01029
01030
solveLinearSystemByCholesky(XtX,XtY,theta_t);
01031 }
else {
01032 theta_t =
solveLinearSystem(XtX, XtY);
01033 }
01034
01035
real squared_loss=0;
01036
if (return_squared_loss)
01037 {
01038
01039
Mat M(inputsize+1,targetsize);
01040
product(M,XtX,theta_t);
01041 squared_loss +=
dot(M,theta_t);
01042 squared_loss += sum_squared_Y;
01043 squared_loss -= 2*
dot(XtY,theta_t);
01044 }
01045
return squared_loss/l;
01046 }
01047
01049 Mat weightedLinearRegression(
VMat inputs,
VMat outputs,
VMat gammas,
real weight_decay)
01050 {
01051
int n = inputs.
width()+1;
01052
int n_outputs = outputs.
width();
01053
Mat XtX(n,n);
01054
Mat XtY(n,n_outputs);
01055
Mat theta_t(n,n_outputs);
01056
real sy=0;
01057
real sg=0;
01058
weightedLinearRegression(inputs, outputs, gammas, weight_decay, theta_t,
false, XtX, XtY,sy,sg);
01059
return theta_t;
01060 }
01061
01062
01063 VMat rebalanceNClasses(
VMat inputs,
int nclasses,
const string& filename)
01064 {
01065
if (!
file_exists(filename))
01066 {
01067
IntVecFile indices(filename,
true);
01068
Vec last = inputs.
lastColumn()->
toMat().
toVecCopy();
01069
const int len = last.
length();
01070
Vec capacity(nclasses);
01071
Array<Vec> index(nclasses);
01072
Array<Vec> index_used(nclasses);
01073
for (
int i=0; i<nclasses; i++) index[i].
resize(len);
01074
for (
int i=0; i<nclasses; i++) index_used[i].
resize(len);
01075
real** p_index;
01076 p_index =
new real*[nclasses];
01077
for (
int i=0; i<nclasses; i++) p_index[i] = index[i].
data();
01078
for (
int i=0; i<nclasses; i++) index_used[i].
clear();
01079
for (
int i=0; i<len; i++)
01080 {
01081
int class_i =
int(last[i]);
01082 *p_index[class_i]++ = i;
01083 capacity[class_i]++;
01084 }
01085
for (
int i=0; i<nclasses; i++) index[i].
resize(
int(capacity[i]));
01086
for (
int i=0; i<nclasses; i++) index_used[i].
resize(
int(capacity[i]));
01087
01088
Mat class_length(nclasses,2);
01089
for (
int i=0; i<nclasses; i++)
01090 {
01091 class_length(i,0) = capacity[i];
01092 class_length(i,1) = i;
01093 }
01094
sortRows(class_length);
01095
Vec remap = class_length.
column(1).toVecCopy();
01096
01097
vector<int> n(nclasses,0);
01098
int new_index = -1;
01099
for (
int i=0; i<len; i++)
01100 {
01101
int c = i%nclasses;
01102
int c_map =
int(remap[c]);
01103
if (c == 0)
01104 {
01105
if (n[0] == capacity[c_map]) n[0] = 0;
01106 new_index = int(index[c_map][n[0]++]);
01107 }
01108
else
01109 {
01110
if (n[c] == capacity[c_map])
01111 {
01112 n[c] = 0;
01113 index_used[c_map].
clear();
01114 }
01115
bool index_found =
false;
01116
int start_pos =
binary_search(index[c_map], real(new_index));
01117
for (
int j=start_pos+1; j<capacity[c_map]; j++)
01118 {
01119
if (index_used[c_map][j] == 0)
01120 {
01121 index_used[c_map][j] = 1;
01122 new_index = int(index[c_map][j]);
01123 index_found =
true;
01124 n[c]++;
01125
break;
01126 }
01127 }
01128
if (!index_found)
01129 {
01130
for (
int j=0; j<start_pos; j++)
01131 {
01132
if (index_used[c_map][j] == 0)
01133 {
01134 index_used[c_map][j] = 1;
01135 new_index = int(index[c_map][j]);
01136 index_found =
true;
01137 n[c]++;
01138
break;
01139 }
01140 }
01141 }
01142
if (!index_found)
01143
PLERROR(
"In rebalanceNClasses: something got wrong!");
01144 }
01145 indices.
put(i, new_index);
01146 }
01147
01148
delete[] p_index;
01149 }
01150
return inputs.
rows(filename);
01151 }
01152
01153 void fullyRebalance2Classes(
VMat inputs,
const string& filename,
bool save_indices)
01154 {
01155
if (!
file_exists(filename))
01156 {
01157
int len = inputs.
length();
01158
01159
int n_zeros = 0;
01160
int n_ones = 0;
01161
Vec zeros(len);
01162
Vec ones(len);
01163
01164
Vec last = inputs.
lastColumn()->
toMat().
toVecCopy();
01165
for (
int i=0; i<len;i++)
01166 {
01167
if (last[i] == 0)
01168 zeros[n_zeros++] = i;
01169
else
01170 ones[n_ones++] = i;
01171 }
01172 zeros.
resize(n_zeros);
01173 ones.
resize(n_ones);
01174
01175
TmpFilenames tmpfile(1);
01176
string fname = save_indices ? filename : tmpfile.
addFilename();
01177
IntVecFile indices(
fname,
true);
01178
int max_symbols =
MAX(n_zeros, n_ones);
01179
for (
int i=0; i<max_symbols; i++)
01180 {
01181 indices.
put(2*i,
int(zeros[i%n_zeros]));
01182 indices.
put(2*i+1,
int(ones[i%n_ones]));
01183 }
01184
if (!save_indices)
01185 {
01186
VMat vm = inputs.
rows(
fname);
01187 vm.
save(filename);
01188 }
01189 }
01190 }
01191
01192 VMat temporalThreshold(
VMat distr,
int threshold_date,
bool is_before,
01193
int yyyymmdd_col)
01194 {
01195
Vec indices(distr->
length());
01196
int n_data = 0;
01197
for (
int i=0; i<distr->
length(); i++)
01198 {
01199
int reference_date = (
int)distr(i, yyyymmdd_col);
01200
if (is_before ? reference_date<=threshold_date : reference_date>=threshold_date)
01201 indices[n_data++] = i;
01202 }
01203 indices.
resize(n_data);
01204
01205
return distr.
rows(indices);
01206 }
01207
01208 VMat temporalThreshold(
VMat distr,
int threshold_date,
bool is_before,
01209
int yyyy_col,
int mm_col,
int dd_col)
01210 {
01211
Vec indices(distr->
length());
01212
int n_data = 0;
01213
for (
int i=0; i<distr->
length(); i++)
01214 {
01215
int reference_date = 10000*(
int)distr(i, yyyy_col) + 100*(
int)distr(i, mm_col) + (
int)distr(i, dd_col);
01216
if (is_before ? reference_date<=threshold_date : reference_date>=threshold_date)
01217 indices[n_data++] = i;
01218 }
01219 indices.
resize(n_data);
01220
01221
return distr.
rows(indices);
01222 }
01223
01228 void correlations(
const VMat& x,
const VMat& y,
Mat& r,
Mat& pvalues)
01229 {
01230
int n=
x.length();
01231
if (n!=y.
length())
01232
PLERROR(
"correlations: x and y must have the same length");
01233
int wx=
x.width();
01234
int wy=y.
width();
01235 r.
resize(wx,wy);
01236 r.
clear();
01237
Mat sxy(wx,wy);
01238
Vec sx2(wx);
01239
Vec sy2(wx);
01240
Vec sx(wx);
01241
Vec sy(wx);
01242
Vec xt(wx);
01243
Vec yt(wy);
01244
for (
int t=0;t<n;t++)
01245 {
01246
x->getRow(t,xt);
01247 y->getRow(t,yt);
01248
for (
int j=0;j<wy;j++)
01249 {
01250
real ytj = yt[j];
01251 sy[j] += ytj;
01252 sy2[j] += ytj*ytj;
01253
for (
int i=0;i<wx;i++)
01254 {
01255
real xti = xt[i];
01256 sxy(i,j) += xti*ytj;
01257 sx[i] += xti;
01258 sx2[i] += xti*xti;
01259 }
01260 }
01261 }
01262
for (
int i=0;i<wx;i++)
01263
for (
int j=0;j<wy;j++)
01264 {
01265
real nv = (sx2[i] - sx[i]/n*sx[i]);
01266
if (nv>0)
01267 r(i,j) = (n*sxy(i,j)-sx[i]*sy[j])/
sqrt((n*sx2[i]-sx[i]*sx[i])*(n*sy2[j]-sy[j]*sy[j]));
01268
else
01269 r(i,j) = 0;
01270
if (r(i,j)<-1.01 || r(i,j)>1.01)
01271
PLWARNING(
"correlation: weird correlation coefficient, %f for %d-th input, %d-target",
01272 r(i,j),i,j);
01273 }
01274 pvalues.
resize(r.
length(),r.
width());
01275
for (
int i=0;i<r.
length();i++)
01276
for (
int j=0;j<r.
width();j++)
01277 pvalues(i,j) =
testNoCorrelationAsymptotically(r(i,j),n);
01278
01279 }
01280
01281
01282 void computeNearestNeighbors(
VMat dataset,
Vec x,
TVec<int>& neighbors,
int ignore_row)
01283 {
01284
int K = neighbors.
length();
01285
BottomNI<real> neighbs(K);
01286
Vec row(dataset->
width());
01287
for(
int i=0; i<dataset->
length(); i++)
01288
if(i!=ignore_row)
01289 {
01290 dataset->getRow(i,row);
01291 neighbs.
update(
powdistance(row,
x), i);
01292 }
01293 neighbs.
sort();
01294
TVec< pair<real,int> > indices = neighbs.
getBottomN();
01295
int nonzero=0;
01296
for(
int k=0;
k<K;
k++)
01297 {
01298
if(indices[
k].
first>0)
01299 nonzero++;
01300 neighbors[
k] = indices[
k].second;
01301 }
01302
if(nonzero==0)
01303
PLERROR(
"All neighbors had 0 distance. Use more neighbors. (There were %i other patterns with same values)",neighbs.
nZeros());
01304 }
01305
01306
01307 }