Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

TestDependenciesCommand.cc

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // TestDependenciesCommand.cc 00004 // 00005 // Copyright (C) 2003 Pascal Vincent 00006 // 00007 // Redistribution and use in source and binary forms, with or without 00008 // modification, are permitted provided that the following conditions are met: 00009 // 00010 // 1. Redistributions of source code must retain the above copyright 00011 // notice, this list of conditions and the following disclaimer. 00012 // 00013 // 2. Redistributions in binary form must reproduce the above copyright 00014 // notice, this list of conditions and the following disclaimer in the 00015 // documentation and/or other materials provided with the distribution. 00016 // 00017 // 3. The name of the authors may not be used to endorse or promote 00018 // products derived from this software without specific prior written 00019 // permission. 00020 // 00021 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00022 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00023 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00024 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00025 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00026 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00027 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00028 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00029 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00030 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00031 // 00032 // This file is part of the PLearn library. For more information on the PLearn 00033 // library, go to the PLearn Web site at www.plearn.org 00034 00035 /* ******************************************************* 00036 * $Id: TestDependenciesCommand.cc,v 1.8 2004/07/21 16:30:49 chrish42 Exp $ 00037 ******************************************************* */ 00038 00040 #include "TestDependenciesCommand.h" 00041 #include <plearn/db/getDataSet.h> 00042 #include <plearn/math/stats_utils.h> 00043 #include <plearn/vmat/VMat_maths.h> 00044 00045 // norman: sorry, no memory check yet! 00046 #ifdef WIN32 00047 #include <windows.h> 00048 // undef min and max macros to avoid conflict with the plearn min and max 00049 #undef min 00050 #undef max 00051 #else 00052 #include <plearn/sys/procinfo.h> 00053 #endif 00054 00055 namespace PLearn { 00056 using namespace std; 00057 00059 PLearnCommandRegistry TestDependenciesCommand::reg_(new TestDependenciesCommand); 00060 00062 void TestDependenciesCommand::run(const vector<string>& args) 00063 { 00064 if(args.size()<1 || args.size()>4) 00065 PLERROR("test-dependencies expects 1 to 4 arguments, check the help"); 00066 00067 VMat data = getDataSet(args[0]); 00068 int inputsize = (args.size()>1)?toint(args[1]):data->inputsize(); 00069 int targetsize = (args.size()>2)?toint(args[2]):data->targetsize(); 00070 int row_blocksize = (args.size()>3)?toint(args[3]):data.length(); 00071 if (args.size()>1) 00072 data->defineSizes(inputsize,targetsize,data->weightsize()); 00073 00074 #ifdef WIN32 00075 MEMORYSTATUS stat; 00076 GlobalMemoryStatus (&stat); 00077 // Total available memory in bytes 00078 int memory_size = (int)stat.dwAvailVirtual; 00079 #else 00080 int memory_size = getSystemTotalMemory(); 00081 #endif 00082 int n_rowblocks = int(ceil(data.length() / real(row_blocksize))); 00083 00084 // statistics computed for each variable, and for each rowblock 00085 // rank in "bestness" 00086 // score in "bestness" 00087 // rank correlation 00088 // rank correlation p-value 00089 // linear correlation 00090 // linear correlation p-value 00091 Mat var_rank(n_rowblocks,inputsize); 00092 Mat var_score(n_rowblocks,inputsize); 00093 Mat var_rank_corr(n_rowblocks,inputsize*targetsize); 00094 Mat var_rc_pvalue(n_rowblocks,inputsize*targetsize); 00095 Mat var_lin_corr(n_rowblocks,inputsize*targetsize); 00096 Mat var_lc_pvalue(n_rowblocks,inputsize*targetsize); 00097 int rowblockstart = 0; 00098 int n=data->length(); 00099 00100 for (int rowblock=0;rowblock<n_rowblocks;rowblock++, rowblockstart += row_blocksize) 00101 { 00102 int rowblocklen = (rowblock<n_rowblocks-1)?row_blocksize:(n-rowblockstart); 00103 VMat x = data.subMat(rowblockstart,0,rowblocklen,inputsize); 00104 VMat y = data.subMat(rowblockstart,inputsize,rowblocklen,targetsize); 00105 Mat r = var_rank_corr(rowblock).toMat(inputsize,targetsize); 00106 Mat pvalues = var_rc_pvalue(rowblock).toMat(inputsize,targetsize); 00107 int col_blocksize = memory_size/(2*sizeof(real)*rowblocklen); 00108 if (col_blocksize>=inputsize) // everything fits in half the memory 00109 { 00110 x = VMat(x.toMat()); 00111 testSpearmanRankCorrelation(x,y,r,pvalues); 00112 } 00113 else // work by column blocks 00114 { 00115 int n_col_blocks = int(ceil(inputsize/real(col_blocksize))); 00116 cout << "work with " << n_col_blocks << " of " << col_blocksize << " columns each (except the last)." << endl; 00117 int bstart=0; 00118 for (int b=0;b<n_col_blocks;b++,bstart+=col_blocksize) 00119 { 00120 int bsize= (b<n_col_blocks-1)?col_blocksize:inputsize-bstart; 00121 VMat block = VMat(x.subMatColumns(bstart,bsize).toMat()); 00122 Mat rb = r.subMatRows(bstart,bsize); 00123 Mat pb = pvalues.subMatRows(bstart,bsize); 00124 cout << "compute rank correlation for variables " << bstart << " - " << bstart+bsize-1 << endl; 00125 testSpearmanRankCorrelation(block,y,rb,pb); 00126 } 00127 } 00128 // linear correlations and corresponding p-values 00129 Mat lr = var_lin_corr(rowblock).toMat(inputsize,targetsize); 00130 Mat lpvalues = var_lc_pvalue(rowblock).toMat(inputsize,targetsize); 00131 correlations(x,y,lr,lpvalues); 00132 Mat scores(inputsize,2); 00133 for (int i=0;i<inputsize;i++) 00134 { 00135 Vec r_i = r(i); 00136 real s =0; 00137 for (int j=0;j<targetsize;j++) 00138 { 00139 real abs_r = fabs(r_i[j]); 00140 if (abs_r>s) s=abs_r; 00141 } 00142 scores(i,0) = s; 00143 scores(i,1) = i; 00144 } 00145 sortRows(scores,0,false); 00146 cout << "Results for " << rowblock << "-th row block, from row " << rowblockstart << " to " << rowblockstart+rowblocklen-1 << " inclusively" << endl; 00147 for (int k=0;k<inputsize;k++) 00148 { 00149 int i = int(scores(k,1)); 00150 var_rank(rowblock,i) = k; 00151 var_score(rowblock,i) = scores(k,0); 00152 cout << k << "-th best variable is " << data->fieldName(i) << " (col. " << i << ")"; 00153 if (targetsize==1) 00154 cout << " with rank correlation = " << r(i,0) << " {p-value = " << pvalues(i,0) 00155 << "}, linear corr. = " 00156 << lr(i,0) 00157 << " {p-value= " << lpvalues(i,0) << "}" << endl; 00158 if (targetsize>1) 00159 { 00160 cout << " (rank corr., rank p-value, lin. corr., lin. p-value) for individual targets: "; 00161 for (int j=0;j<targetsize;j++) 00162 cout << "(" << r(i,j) << ", " << pvalues(i,j) << "," << lr(i,j) << ", " 00163 << lpvalues(i,j) << ") "; 00164 cout << endl; 00165 } 00166 } 00167 } 00168 // compute mean var_score for each variable and sort them accordingly 00169 Mat mean_score(inputsize,2); 00170 for (int i=0;i<inputsize;i++) 00171 { 00172 mean_score(i,0) = mean(var_score.column(i)); 00173 mean_score(i,1) = i; 00174 } 00175 sortRows(mean_score,0,false); 00176 // compute statistics across row blocks 00177 cout << "For each block statistic print (mean,stdev,min,max)\n" << endl; 00178 for (int k=0;k<inputsize;k++) 00179 { 00180 int i = int(mean_score(k,1)); 00181 Mat varrank = var_rank.column(i); 00182 Mat varscore = var_score.column(i); 00183 Mat varrc = var_rank_corr.column(i); 00184 Mat varrcpv = var_rc_pvalue.column(i); 00185 Mat varlc = var_lin_corr.column(i); 00186 Mat varlcpv = var_lc_pvalue.column(i); 00187 Vec rankm(1),rankdev(1),scorem(1),scoredev(1),rcm(1),rcdev(1),rcpvm(1),rcpvdev(1), 00188 lcm(1),lcdev(1),lcpvm(1),lcpvdev(1); 00189 computeMeanAndStddev(varrank,rankm,rankdev); 00190 computeMeanAndStddev(varscore,scorem,scoredev); 00191 computeMeanAndStddev(varrc,rcm,rcdev); 00192 computeMeanAndStddev(varrcpv,rcpvm,rcpvdev); 00193 computeMeanAndStddev(varlc,lcm,lcdev); 00194 computeMeanAndStddev(varlcpv,lcpvm,lcpvdev); 00195 cout << k << "-th best variable is " << data->fieldName(i) << " (col. " << i << ")"; 00196 if (targetsize==1) 00197 { 00198 cout << " rank corr (" << rcm[0] << "," << rcdev[0] << "," << min(varrc) << "," << max(varrc) << " ) "; 00199 cout << " var rank (" << rankm[0] << "," << rankdev[0] << "," << min(varrank) << "," << max(varrank) << " ) "; 00200 cout << " rank cor pval(" << rcpvm[0] << "," << rcpvdev[0] << "," << min(varrcpv) << "," << max(varrcpv) << " ) "; 00201 cout << " lin corr (" << lcm[0] << "," << lcdev[0] << "," << min(varlc) << "," << max(varlc) << " ) "; 00202 cout << " lin cor pval (" << lcpvm[0] << "," << lcpvdev[0] << "," << min(varlcpv) << "," << max(varlcpv) << " ) " << endl; 00203 } 00204 else PLWARNING("not yet implemented"); 00205 } 00206 } 00207 00208 } // end of namespace PLearn 00209

Generated on Tue Aug 17 16:08:15 2004 for PLearn by doxygen 1.3.7