00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00043
#include "stats_utils.h"
00044
#include "TMat_maths.h"
00045
#include "pl_erf.h"
00046
#include "random.h"
00047
00048
namespace PLearn {
00049
using namespace std;
00050
00062 void SpearmanRankCorrelation(
const VMat &x,
const VMat& y,
Mat& r)
00063 {
00064
int n=
x.length();
00065
if (n!=y.
length())
00066
PLERROR(
"SpearmanRankCorrelation: x and y must have the same length");
00067
int wx=
x.width();
00068
int wy=y.
width();
00069 r.
resize(wx,wy);
00070 r.
clear();
00071
Mat y_ranks;
00072
computeRanks(y.
toMat(),y_ranks);
00073
Mat x_rank(n,1);
00074
00075
real rank_normalization = 12.0/(n*(n-1.0)*(n-2.0));
00076
real half = n*0.5;
00077
for (
int i=0;i<wx;i++)
00078 {
00079
Mat xi=
x.column(i).toMat();
00080
00081 cout <<
".";
00082
computeRanks(xi,x_rank);
00083
00084
Vec r_i = r(i);
00085
for (
int k=0;
k<n;
k++)
00086
for (
int j=0;j<wy;j++)
00087 {
00088
00089
00090 r_i[j] += (x_rank(
k,0) - half) * (y_ranks(
k,j)-half) * rank_normalization;
00091 }
00092
for (
int j=0;j<wy;j++)
00093
if (r_i[j]<-1.01 || r_i[j]>1.01)
00094
PLWARNING(
"SpearmanRankCorrelation: weird correlation coefficient, %f for %d-th input, %d-target",
00095 r_i[j],i,j);
00096 }
00097 cout <<
endl;
00098 }
00099
00105 real testNoCorrelationAsymptotically(
real r,
int n)
00106 {
00107
real fz = fabs(r)*
sqrt(n-1.0);
00108
return (1-
gauss_01_cum(fz)) +
gauss_01_cum(-fz);
00109 }
00110
00122 void testSpearmanRankCorrelationPValues(
const VMat &x,
const VMat& y,
Mat& pvalues)
00123 {
00124
Mat r;
00125
testSpearmanRankCorrelation(
x,y,r,pvalues);
00126 }
00127
00128 void testSpearmanRankCorrelation(
const VMat &x,
const VMat& y,
Mat& r,
Mat& pvalues)
00129 {
00130
SpearmanRankCorrelation(
x,y,r);
00131
int n=
x.length();
00132 pvalues.
resize(r.
length(),r.
width());
00133
for (
int i=0;i<r.
length();i++)
00134
for (
int j=0;j<r.
width();j++)
00135 pvalues(i,j) =
testNoCorrelationAsymptotically(r(i,j),n);
00136 }
00137
00138
00140 real max_cdf_diff(
Vec& v1,
Vec& v2)
00141 {
00142
int n1 = v1.
length();
00143
int n2 = v2.
length();
00144
real inv_n1 = 1./n1;
00145
real inv_n2 = 1./n2;
00146
sortElements(v1);
00147
sortElements(v2);
00148
int i1=0;
00149
int i2=0;
00150
real maxdiff = 0;
00151
00152
for(;;)
00153 {
00154
00155
if(v1[i1]<v2[i2])
00156 {
00157 i1++;
00158
if(i1+1==n1)
00159
break;
00160 }
00161
else
00162 {
00163 i2++;
00164
if(i2==n2)
00165
break;
00166 }
00167
00168
if ((i1>0 && v1[i1]==v1[i1-1]) ||
00169 (i2>0 && v2[i2]==v2[i2-1]) ||
00170 (v1[i1]<v2[i2] && v1[i1+1]<v2[i2]))
00171
continue;
00172
00173
real F1 = inv_n1*i1;
00174
real F2 = inv_n2*i2;
00175
real diff = fabs(F1-F2);
00176
if(diff>maxdiff)
00177 maxdiff = diff;
00178
00179
00180 }
00181
00182
return maxdiff;
00183 }
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221 real KS_test(
real D,
real N,
int conv)
00222 {
00223
int k;
00224
real res = 0.0;
00225
real sn =
sqrt((
double)N);
00226
real ks = D*(sn+0.12+0.11/sn);
00227
real ks2 = ks*ks;
00228
for (
k=1;
k<=conv;
k++) {
00229
real x = ((
k % 2) ? 1 : -1) *
exp( -2 * ks2 *
k *
k );
00230
if (
k==conv)
00231 res += 0.5*
x;
00232
else
00233 res +=
x;
00234 }
00235
return 2 * res;
00236 }
00237
00238 void KS_test(
Vec& v1,
Vec& v2,
int conv,
real& D,
real& p_value)
00239 {
00240
int n1 = v1.
length();
00241
int n2 = v2.
length();
00242
real N = (n1/
real(n1+n2))*n2;
00243 D =
max_cdf_diff(v1, v2);
00244
p_value =
KS_test(D,N,conv);
00245 }
00246
00247 real KS_test(
Vec& v1,
Vec& v2,
int conv)
00248 {
00249
real D, ks_stat;
00250
KS_test(v1,v2,conv,D, ks_stat);
00251
return ks_stat;
00252 }
00253
00254 real paired_t_test(
Vec u,
Vec v)
00255 {
00256
int n = u.
length();
00257
if( v.
length() != n )
00258 {
00259
PLWARNING(
"paired_t_test: "
00260
"Can't make a paired t-test on to unequally lengthed vectors (%d != %d).",
00261 n, v.
length());
00262
return MISSING_VALUE;
00263 }
00264
00265
real ubar =
mean(u);
00266
real vbar =
mean(v);
00267
Vec u2 = u - ubar;
00268
Vec v2 = v - vbar;
00269
00270
return (ubar - vbar) *
sqrt( n*(n-1) /
sumsquare(u2-v2));
00271 }
00272
00273 }
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329