00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00040
#include "TestDependenciesCommand.h"
00041
#include <plearn/db/getDataSet.h>
00042
#include <plearn/math/stats_utils.h>
00043
#include <plearn/vmat/VMat_maths.h>
00044
00045
00046
#ifdef WIN32
00047
#include <windows.h>
00048
00049
#undef min
00050
#undef max
00051
#else
00052
#include <plearn/sys/procinfo.h>
00053
#endif
00054
00055
namespace PLearn {
00056
using namespace std;
00057
00059
// Definition of the static member reg_, constructed from a freshly allocated
// TestDependenciesCommand instance.
// NOTE(review): presumably this is what makes the command known to the PLearn
// command dispatcher at static-initialization time -- confirm against the
// PLearnCommandRegistry constructor.
PLearnCommandRegistry TestDependenciesCommand::reg_(
new TestDependenciesCommand);
00060
00062 void TestDependenciesCommand::run(
const vector<string>& args)
00063 {
00064
if(args.size()<1 || args.size()>4)
00065
PLERROR(
"test-dependencies expects 1 to 4 arguments, check the help");
00066
00067
VMat data =
getDataSet(args[0]);
00068
int inputsize = (args.size()>1)?
toint(args[1]):data->inputsize();
00069
int targetsize = (args.size()>2)?
toint(args[2]):data->targetsize();
00070
int row_blocksize = (args.size()>3)?
toint(args[3]):data.
length();
00071
if (args.size()>1)
00072 data->defineSizes(inputsize,targetsize,data->weightsize());
00073
00074
#ifdef WIN32
00075
MEMORYSTATUS stat;
00076 GlobalMemoryStatus (&stat);
00077
00078
int memory_size = (
int)stat.dwAvailVirtual;
00079
#else
00080
int memory_size =
getSystemTotalMemory();
00081
#endif
00082
int n_rowblocks =
int(ceil(data.
length() /
real(row_blocksize)));
00083
00084
00085
00086
00087
00088
00089
00090
00091
Mat var_rank(n_rowblocks,inputsize);
00092
Mat var_score(n_rowblocks,inputsize);
00093
Mat var_rank_corr(n_rowblocks,inputsize*targetsize);
00094
Mat var_rc_pvalue(n_rowblocks,inputsize*targetsize);
00095
Mat var_lin_corr(n_rowblocks,inputsize*targetsize);
00096
Mat var_lc_pvalue(n_rowblocks,inputsize*targetsize);
00097
int rowblockstart = 0;
00098
int n=data->
length();
00099
00100
for (
int rowblock=0;rowblock<n_rowblocks;rowblock++, rowblockstart += row_blocksize)
00101 {
00102
int rowblocklen = (rowblock<n_rowblocks-1)?row_blocksize:(n-rowblockstart);
00103
VMat x = data.
subMat(rowblockstart,0,rowblocklen,inputsize);
00104
VMat y = data.
subMat(rowblockstart,inputsize,rowblocklen,targetsize);
00105
Mat r = var_rank_corr(rowblock).toMat(inputsize,targetsize);
00106
Mat pvalues = var_rc_pvalue(rowblock).toMat(inputsize,targetsize);
00107
int col_blocksize = memory_size/(2*
sizeof(
real)*rowblocklen);
00108
if (col_blocksize>=inputsize)
00109 {
00110
x =
VMat(
x.toMat());
00111
testSpearmanRankCorrelation(
x,y,r,pvalues);
00112 }
00113
else
00114 {
00115
int n_col_blocks = int(ceil(inputsize/
real(col_blocksize)));
00116 cout <<
"work with " << n_col_blocks <<
" of " << col_blocksize <<
" columns each (except the last)." <<
endl;
00117
int bstart=0;
00118
for (
int b=0;b<n_col_blocks;b++,bstart+=col_blocksize)
00119 {
00120
int bsize= (b<n_col_blocks-1)?col_blocksize:inputsize-bstart;
00121
VMat block =
VMat(
x.subMatColumns(bstart,bsize).toMat());
00122
Mat rb = r.
subMatRows(bstart,bsize);
00123
Mat pb = pvalues.
subMatRows(bstart,bsize);
00124 cout <<
"compute rank correlation for variables " << bstart <<
" - " << bstart+bsize-1 <<
endl;
00125
testSpearmanRankCorrelation(block,y,rb,pb);
00126 }
00127 }
00128
00129
Mat lr = var_lin_corr(rowblock).toMat(inputsize,targetsize);
00130
Mat lpvalues = var_lc_pvalue(rowblock).toMat(inputsize,targetsize);
00131
correlations(
x,y,lr,lpvalues);
00132
Mat scores(inputsize,2);
00133
for (
int i=0;i<inputsize;i++)
00134 {
00135
Vec r_i = r(i);
00136
real s =0;
00137
for (
int j=0;j<targetsize;j++)
00138 {
00139
real abs_r = fabs(r_i[j]);
00140
if (abs_r>s) s=abs_r;
00141 }
00142 scores(i,0) = s;
00143 scores(i,1) = i;
00144 }
00145
sortRows(scores,0,
false);
00146 cout <<
"Results for " << rowblock <<
"-th row block, from row " << rowblockstart <<
" to " << rowblockstart+rowblocklen-1 <<
" inclusively" <<
endl;
00147
for (
int k=0;
k<inputsize;
k++)
00148 {
00149
int i = int(scores(
k,1));
00150 var_rank(rowblock,i) =
k;
00151 var_score(rowblock,i) = scores(
k,0);
00152 cout <<
k <<
"-th best variable is " << data->
fieldName(i) <<
" (col. " << i <<
")";
00153
if (targetsize==1)
00154 cout <<
" with rank correlation = " << r(i,0) <<
" {p-value = " << pvalues(i,0)
00155 <<
"}, linear corr. = "
00156 << lr(i,0)
00157 <<
" {p-value= " << lpvalues(i,0) <<
"}" <<
endl;
00158
if (targetsize>1)
00159 {
00160 cout <<
" (rank corr., rank p-value, lin. corr., lin. p-value) for individual targets: ";
00161
for (
int j=0;j<targetsize;j++)
00162 cout <<
"(" << r(i,j) <<
", " << pvalues(i,j) <<
"," << lr(i,j) <<
", "
00163 << lpvalues(i,j) <<
") ";
00164 cout <<
endl;
00165 }
00166 }
00167 }
00168
00169
Mat mean_score(inputsize,2);
00170
for (
int i=0;i<inputsize;i++)
00171 {
00172 mean_score(i,0) =
mean(var_score.
column(i));
00173 mean_score(i,1) = i;
00174 }
00175
sortRows(mean_score,0,
false);
00176
00177 cout <<
"For each block statistic print (mean,stdev,min,max)\n" <<
endl;
00178
for (
int k=0;
k<inputsize;
k++)
00179 {
00180
int i = int(mean_score(
k,1));
00181
Mat varrank = var_rank.
column(i);
00182
Mat varscore = var_score.
column(i);
00183
Mat varrc = var_rank_corr.
column(i);
00184
Mat varrcpv = var_rc_pvalue.
column(i);
00185
Mat varlc = var_lin_corr.
column(i);
00186
Mat varlcpv = var_lc_pvalue.
column(i);
00187
Vec rankm(1),rankdev(1),scorem(1),scoredev(1),rcm(1),rcdev(1),rcpvm(1),rcpvdev(1),
00188 lcm(1),lcdev(1),lcpvm(1),lcpvdev(1);
00189
computeMeanAndStddev(varrank,rankm,rankdev);
00190
computeMeanAndStddev(varscore,scorem,scoredev);
00191
computeMeanAndStddev(varrc,rcm,rcdev);
00192
computeMeanAndStddev(varrcpv,rcpvm,rcpvdev);
00193
computeMeanAndStddev(varlc,lcm,lcdev);
00194
computeMeanAndStddev(varlcpv,lcpvm,lcpvdev);
00195 cout <<
k <<
"-th best variable is " << data->
fieldName(i) <<
" (col. " << i <<
")";
00196
if (targetsize==1)
00197 {
00198 cout <<
" rank corr (" << rcm[0] <<
"," << rcdev[0] <<
"," <<
min(varrc) <<
"," <<
max(varrc) <<
" ) ";
00199 cout <<
" var rank (" << rankm[0] <<
"," << rankdev[0] <<
"," <<
min(varrank) <<
"," <<
max(varrank) <<
" ) ";
00200 cout <<
" rank cor pval(" << rcpvm[0] <<
"," << rcpvdev[0] <<
"," <<
min(varrcpv) <<
"," <<
max(varrcpv) <<
" ) ";
00201 cout <<
" lin corr (" << lcm[0] <<
"," << lcdev[0] <<
"," <<
min(varlc) <<
"," <<
max(varlc) <<
" ) ";
00202 cout <<
" lin cor pval (" << lcpvm[0] <<
"," << lcpvdev[0] <<
"," <<
min(varlcpv) <<
"," <<
max(varlcpv) <<
" ) " <<
endl;
00203 }
00204
else PLWARNING(
"not yet implemented");
00205 }
00206 }
00207
00208 }
00209