PLearn: stats_utils.cc Source File

00001 // -*- C++ -*- 00002 00003 // PLearn (A C++ Machine Learning Library) 00004 // Copyright (C) 2003 Pascal Vincent, Yoshua Bengio 00005 00006 // Redistribution and use in source and binary forms, with or without 00007 // modification, are permitted provided that the following conditions are met: 00008 // 00009 // 1. Redistributions of source code must retain the above copyright 00010 // notice, this list of conditions and the following disclaimer. 00011 // 00012 // 2. Redistributions in binary form must reproduce the above copyright 00013 // notice, this list of conditions and the following disclaimer in the 00014 // documentation and/or other materials provided with the distribution. 00015 // 00016 // 3. The name of the authors may not be used to endorse or promote 00017 // products derived from this software without specific prior written 00018 // permission. 00019 // 00020 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00021 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00022 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00023 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00024 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00025 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00026 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00027 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00028 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00029 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00030 // 00031 // This file is part of the PLearn library. For more information on the PLearn 00032 // library, go to the PLearn Web site at www.plearn.org 00033 00034 00035 /* ******************************************************* 00036 * $Id: stats_utils.cc,v 1.10 2004/08/04 13:38:10 tihocan Exp $ 00037 * This file is part of the PLearn library. 00038 ******************************************************* */ 00039 00040 00043 #include "stats_utils.h" 00044 #include "TMat_maths.h" 00045 #include "pl_erf.h" 00046 #include "random.h" 00047 00048 namespace PLearn { 00049 using namespace std; 00050 00062 void SpearmanRankCorrelation(const VMat &x, const VMat& y, Mat& r) 00063 { 00064 int n=x.length(); 00065 if (n!=y.length()) 00066 PLERROR("SpearmanRankCorrelation: x and y must have the same length"); 00067 int wx=x.width(); 00068 int wy=y.width(); 00069 r.resize(wx,wy); 00070 r.clear(); 00071 Mat y_ranks; 00072 computeRanks(y.toMat(),y_ranks); 00073 Mat x_rank(n,1); 00074 //real rank_normalization = sqrt(1.0/(n*n-1.0)); 00075 real rank_normalization = 12.0/(n*(n-1.0)*(n-2.0)); 00076 real half = n*0.5; 00077 for (int i=0;i<wx;i++) 00078 { 00079 Mat xi=x.column(i).toMat(); 00080 // compute the rank of the i-th column of x 00081 cout << "."; 00082 computeRanks(xi,x_rank); 00083 // compute the Spearman rank correlation coefficient: 00084 Vec r_i = r(i); 00085 for (int k=0;k<n;k++) 00086 for (int j=0;j<wy;j++) 00087 { 00088 //real delta = (x_rank(k,0) - y_ranks(k,j))*rank_normalization; 00089 // r_i[j] += delta*delta; 00090 r_i[j] += (x_rank(k,0) - half) * (y_ranks(k,j)-half) * rank_normalization; 00091 } 00092 for (int j=0;j<wy;j++) 00093 if (r_i[j]<-1.01 || r_i[j]>1.01) 00094 PLWARNING("SpearmanRankCorrelation: weird correlation coefficient, %f for %d-th input, %d-target", 00095 r_i[j],i,j); 00096 } 00097 cout << endl; 00098 } 00099 00105 real testNoCorrelationAsymptotically(real r, int n) 00106 { 00107 real fz = fabs(r)*sqrt(n-1.0); 00108 return (1-gauss_01_cum(fz)) + gauss_01_cum(-fz); 00109 } 00110 00122 void testSpearmanRankCorrelationPValues(const VMat &x, const VMat& y, Mat& pvalues) 00123 { 00124 Mat r; 00125 testSpearmanRankCorrelation(x,y,r,pvalues); 00126 } 00127 00128 void testSpearmanRankCorrelation(const VMat &x, const VMat& y, Mat& r, Mat& pvalues) 00129 { 00130 SpearmanRankCorrelation(x,y,r); 00131 int n=x.length(); 00132 pvalues.resize(r.length(),r.width()); 00133 for (int i=0;i<r.length();i++) 00134 for (int j=0;j<r.width();j++) 00135 pvalues(i,j) = testNoCorrelationAsymptotically(r(i,j),n); 00136 } 00137 00138 00140 real max_cdf_diff(Vec& v1, Vec& v2) 00141 { 00142 int n1 = v1.length(); 00143 int n2 = v2.length(); 00144 real inv_n1 = 1./n1; 00145 real inv_n2 = 1./n2; 00146 sortElements(v1); 00147 sortElements(v2); 00148 int i1=0; 00149 int i2=0; 00150 real maxdiff = 0; 00151 00152 for(;;) 00153 { 00154 00155 if(v1[i1]<v2[i2]) 00156 { 00157 i1++; 00158 if(i1+1==n1) 00159 break; 00160 } 00161 else 00162 { 00163 i2++; 00164 if(i2==n2) 00165 break; 00166 } 00167 00168 if ((i1>0 && v1[i1]==v1[i1-1]) || 00169 (i2>0 && v2[i2]==v2[i2-1]) || 00170 (v1[i1]<v2[i2] && v1[i1+1]<v2[i2])) 00171 continue; // to deal with discrete-valued variables: only look at "changing-value" places 00172 00173 real F1 = inv_n1*i1; 00174 real F2 = inv_n2*i2; 00175 real diff = fabs(F1-F2); 00176 if(diff>maxdiff) 00177 maxdiff = diff; 00178 00179 // cerr << "v1[" << i1 << "]=" << v1[i1] << "; v2[" << i2 << "]=" << v2[i2] << "; F1=" << F1 << "; F2=" << F2 << "; diff=" << diff << endl; 00180 } 00181 00182 return maxdiff; 00183 } 00184 00185 00186 /*************************************************************/ 00187 /* 00188 Return the probability that the Kolmogorov-Smirnov statistic 00189 D takes the observed value or greater, given the null hypothesis 00190 that the distributions that are compared are 00191 really identical. N is the effective number of samples 00192 used for comparing the distributions. 00193 The argument conv gives the precision with which 00194 this probability is computed. A value above 10 does not bring 00195 much improvement. Note that the statistic D can 00196 be obtained as follows: 00197 00198 Comparing two empirical distributions from data sets D_1 and D_2: 00199 Let F_1(x) the empirical cumulative distribution of D_1 of size N_1, and 00200 let F_2(x) the empirical cumulative distribution of D_2 of size N_2. Then 00201 00202 D = max_x | F_1(x) - F_2(x) | 00203 00204 and the effective N is N_1 N_2 / (N_1 + N_2). 00205 00206 Comparing a theoretical distribution F and a data set D of size N with 00207 empirical cumulative distribution F_N: 00208 00209 D = max_x | F(x) - F_N(x) | 00210 00211 This function returns the following 00212 00213 P(D > observed d | same distributions) estimated by 00214 2 sum_{k=1}^{infty} (-1)^{k-1} exp(-2k^2 a^2) 00215 00216 where a = sqrt(D*(sqrt(N)+0.12+0.11/sqrt(N))) 00217 00218 Ref: Stephens, M.A. (1970), Journal of the Royal Statistical Society B, vol. 32, pp. 115-122. 00219 00220 */ 00221 real KS_test(real D, real N, int conv) 00222 { 00223 int k; 00224 real res = 0.0; 00225 real sn = sqrt((double)N); 00226 real ks = D*(sn+0.12+0.11/sn); 00227 real ks2 = ks*ks; 00228 for (k=1;k<=conv;k++) { 00229 real x = ((k % 2) ? 1 : -1) * exp( -2 * ks2 * k * k ); 00230 if (k==conv) 00231 res += 0.5*x; 00232 else 00233 res += x; 00234 } 00235 return 2 * res; 00236 } 00237 00238 void KS_test(Vec& v1, Vec& v2, int conv, real& D, real& p_value) 00239 { 00240 int n1 = v1.length(); 00241 int n2 = v2.length(); 00242 real N = (n1/real(n1+n2))*n2; 00243 D = max_cdf_diff(v1, v2); 00244 p_value = KS_test(D,N,conv); 00245 } 00246 00247 real KS_test(Vec& v1, Vec& v2, int conv) 00248 { 00249 real D, ks_stat; 00250 KS_test(v1,v2,conv,D, ks_stat); 00251 return ks_stat; 00252 } 00253 00254 real paired_t_test(Vec u, Vec v) 00255 { 00256 int n = u.length(); 00257 if( v.length() != n ) 00258 { 00259 PLWARNING("paired_t_test: " 00260 "Can't make a paired t-test on to unequally lengthed vectors (%d != %d).", 00261 n, v.length()); 00262 return MISSING_VALUE; 00263 } 00264 00265 real ubar = mean(u); 00266 real vbar = mean(v); 00267 Vec u2 = u - ubar; 00268 Vec v2 = v - vbar; 00269 00270 return (ubar - vbar) * sqrt( n*(n-1) / sumsquare(u2-v2)); 00271 } 00272 00273 } // end of namespace PLearn 00274 00275 /* 00276 00277 // Test code... 00278 00279 #include "random.h" 00280 #include <plearn/display/Gnuplot.h> 00281 00282 using namespace PLearn; 00283 00284 // should plot a uniform distribution 00285 void verify_ks(int n1=1000, int n2=1000, int k=100) 00286 { 00287 Vec v1(n1); 00288 Vec v2(n2); 00289 Vec ks(k); 00290 00291 for(int i=0; i<k; i++) 00292 { 00293 fill_random_normal(v1, 0, 1); 00294 fill_random_normal(v2, 0.1, 1); 00295 // fill_random_uniform(v2, -0.5, 0.5); 00296 00297 ks[i] = KS_test(v1,v2); 00298 cerr << '.'; 00299 } 00300 00301 Gnuplot gp; 00302 gp.plotcdf(ks); 00303 char s[100]; 00304 cin.getline(s, 100); 00305 } 00306 00307 int main() 00308 { 00309 Vec v1(5); 00310 v1 << "1 2 5 9 14"; 00311 00312 Vec v2(6); 00313 v2 << "-1 4 12 14 25 3"; 00314 00315 real md = max_cdf_diff(v1,v2); 00316 00317 int n1 = v1.length(); 00318 int n2 = v2.length(); 00319 00320 cout << md << endl; 00321 cout << KS_test(md, n1*n2/real(n1+n2)) << endl; 00322 00323 verify_ks(); 00324 00325 return 0; 00326 } 00327 00328 */ 00329