#include "GaussianProcessRegressor.h"
#include <plearn/math/pl_erf.h>
#include <plearn/math/plapack.h>

namespace PLearn {
using namespace std;

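// GaussianProcessRegressor performs Gaussian Process regression: train()
// builds (and optionally normalizes) the Gram matrix K of the training
// inputs, and predictions apply (K + noise_sd^2 I)^{-1} through the
// eigendecomposition of K (see inverseCovTimesVec and QFormInverse below).
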
GaussianProcessRegressor::GaussianProcessRegressor() :
    inherited(), Gram_matrix_normalization("none"),
    max_nb_evectors(-1)
{}

PLEARN_IMPLEMENT_OBJECT(
    GaussianProcessRegressor,
    "Basic version of Gaussian Process regression.",
    "NO HELP");

void GaussianProcessRegressor::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies)
{
    inherited::makeDeepCopyFromShallowCopy(copies);
    deepCopyField(kernel, copies);
    deepCopyField(noise_sd, copies);
    deepCopyField(alpha, copies);
    deepCopyField(Kxxi, copies);
    deepCopyField(Kxx, copies);
    deepCopyField(K, copies);
    deepCopyField(eigenvectors, copies);
    deepCopyField(eigenvalues, copies);
    deepCopyField(meanK, copies);
}

void GaussianProcessRegressor::setInput(const Vec& input)
{
    setInput_const(input);
}

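// Cache, for a test input x, the kernel values Kxxi[i] = K(x, x_i) against
// every training point and the self-kernel value Kxx = K(x, x), then apply
// the same Gram-matrix normalization used at train time so that test-time
// kernel values are consistent with the normalized training Gram matrix.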
void GaussianProcessRegressor::setInput_const(const Vec& input) const
{
    for (int i=0; i<Kxxi.length(); i++)
        Kxxi[i] = kernel->evaluate_x_i(input, i);
    Kxx = kernel->evaluate(input, input);

    if (Gram_matrix_normalization == "centering_a_dotproduct")
    {
        real kmean = mean(Kxxi);
        for (int i=0; i<Kxxi.length(); i++)
            Kxxi[i] = Kxxi[i] - kmean - meanK[i] + mean_allK;
        Kxx = Kxx - kmean - kmean + mean_allK;
    }
    else if (Gram_matrix_normalization == "centering_a_distance")
    {
        real kmean = mean(Kxxi);
        for (int i=0; i<Kxxi.length(); i++)
            Kxxi[i] = -0.5*(Kxxi[i] - kmean - meanK[i] + mean_allK);
        Kxx = -0.5*(Kxx - kmean - kmean + mean_allK);
    }
    else if (Gram_matrix_normalization == "divisive")
    {
        real kmean = mean(Kxxi);
        for (int i=0; i<Kxxi.length(); i++)
            Kxxi[i] = Kxxi[i] / sqrt(kmean*meanK[i]);
        Kxx = Kxx / kmean;
    }
}

void GaussianProcessRegressor::declareOptions(OptionList& ol)
{
    declareOption(ol, "kernel", &GaussianProcessRegressor::kernel, OptionBase::buildoption,
                  "The kernel (seen as a symmetric, two-argument function of a pair of input points)\n"
                  "that corresponds to the prior covariance on the function to be learned.\n");

    declareOption(ol, "noise_sd", &GaussianProcessRegressor::noise_sd, OptionBase::buildoption,
                  "Output noise standard deviation (one element per output).\n");

    declareOption(ol, "max_nb_evectors", &GaussianProcessRegressor::max_nb_evectors, OptionBase::buildoption,
                  "Maximum number of eigenvectors of the Gram matrix to compute (or -1 if all should be computed).\n");

    declareOption(ol, "Gram_matrix_normalization", &GaussianProcessRegressor::Gram_matrix_normalization,
                  OptionBase::buildoption,
                  "Normalization method to apply to the Gram matrix. Expected values are:\n"
                  "\"none\": no normalization\n"
                  "\"centering_a_dotproduct\": this is the kernel PCA centering\n"
                  "     K_{ij} <-- K_{ij} - mean_i(K_ij) - mean_j(K_ij) + mean_{ij}(K_ij)\n"
                  "\"centering_a_distance\": this is the MDS transformation of squared distances to dot products\n"
                  "     K_{ij} <-- -0.5(K_{ij} - mean_i(K_ij) - mean_j(K_ij) + mean_{ij}(K_ij))\n"
                  "\"divisive\": this is the spectral clustering and Laplacian eigenmaps normalization\n"
                  "     K_{ij} <-- K_{ij}/sqrt(mean_i(K_ij) mean_j(K_ij))\n");

    inherited::declareOptions(ol);
}

void GaussianProcessRegressor::build_()
{
    if (expdir != "")
    {
        if (!force_mkdir(expdir))
            PLERROR("In GaussianProcessRegressor: could not create experiment directory %s",
                    expdir.c_str());
        expdir = abspath(expdir);
    }

    if (train_set)
    {
        int l = train_set->length();
        // n_outputs must be set before sizing alpha, since the number of
        // coefficient rows (one per output) depends on it.
        n_outputs = train_set->targetsize();
        K.resize(l, l);
        Kxxi.resize(l);
        alpha.resize(n_outputs, l);
        meanK.resize(l);
    }
}

int GaussianProcessRegressor::outputsize() const
{
    int output_size = 0;
    if (outputs_def.find("e") != string::npos)
        output_size += n_outputs;
    if (outputs_def.find("v") != string::npos)
        output_size += n_outputs;
    return output_size;
}

void GaussianProcessRegressor::build()
{
    inherited::build();
    build_();
}

void GaussianProcessRegressor::forget()
{
    stage = 0;
}

GaussianProcessRegressor::~GaussianProcessRegressor()
{
}

TVec<string> GaussianProcessRegressor::getTrainCostNames() const
{
    TVec<string> names(2);
    names[0] = "log-likelihood";
    names[1] = "mse";
    return names;
}

TVec<string> GaussianProcessRegressor::getTestCostNames() const
{ return getTrainCostNames(); }

int GaussianProcessRegressor::getTestCostIndex(const string& costname) const
{
    TVec<string> costnames = getTestCostNames();
    for (int i=0; i<costnames.length(); i++)
        if (costnames[i] == costname)
            return i;
    return -1;
}

int GaussianProcessRegressor::getTrainCostIndex(const string& costname) const
{
    TVec<string> costnames = getTrainCostNames();
    for (int i=0; i<costnames.length(); i++)
        if (costnames[i] == costname)
            return i;
    return -1;
}

double GaussianProcessRegressor::log_density(const Vec& y) const
{
    PLERROR("GaussianProcessRegressor::log_density not implemented yet");
    return 0;
}

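// Posterior mean of each output at the input set by setInput_const:
//   E[y_i | x] = Kxxi . alpha(i),
// where alpha(i) = (K + noise_sd[i]^2 I)^{-1} targets_i was computed by train().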
void GaussianProcessRegressor::expectation(Vec expected_y) const
{
    for (int i=0; i<n_outputs; i++)
        expected_y[i] = dot(Kxxi, alpha(i));
}

Vec GaussianProcessRegressor::expectation() const
{
    static Vec expected_target;
    expected_target.resize(n_outputs);
    expectation(expected_target);
    return expected_target;
}

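// Diagonal of the posterior predictive covariance, one entry per output:
//   Var[y_i | x] = K(x,x) - Kxxi' (K + noise_sd[i]^2 I)^{-1} Kxxi.
// The observation noise variance is not added here; computeCostsFromOutputs
// adds it back when evaluating the log-likelihood.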
void GaussianProcessRegressor::variance(Vec diag_variances) const
{
    for (int i=0; i<n_outputs; i++)
    {
        real v = Kxx;
        v -= QFormInverse(noise_sd[i]*noise_sd[i], Kxxi);
        diag_variances[i] = v;
    }
}

Mat GaussianProcessRegressor::variance() const
{
    static Mat var;
    if (var.length() != n_outputs)
    {
        var.resize(n_outputs, n_outputs);
        var.clear();
    }
    for (int i=0; i<n_outputs; i++)
    {
        real v = Kxx;
        v -= QFormInverse(noise_sd[i]*noise_sd[i], Kxxi);
        var(i,i) = v;
    }
    return var;
}

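// The output vector concatenates the requested statistics: the n_outputs
// expectations if outputs_def contains "e", followed by the n_outputs
// variances if it contains "v".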
void GaussianProcessRegressor::computeOutput(const Vec& input, Vec& output) const
{
    setInput_const(input);
    int i0 = 0;
    if (outputs_def.find("e") != string::npos)
    {
        expectation(output.subVec(i0, n_outputs));
        i0 += n_outputs;
    }
    if (outputs_def.find("v") != string::npos)
    {
        variance(output.subVec(i0, n_outputs));
        i0 += n_outputs;
    }
}

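// Costs for one example: the log-likelihood of the target under the
// predictive distribution N(mu_i, var_i + noise_sd[i]^2) summed over
// outputs, and the squared error. The expectation and variance are read
// back from the output vector when present, and recomputed otherwise.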
void GaussianProcessRegressor::computeCostsFromOutputs(const Vec& input, const Vec& output,
                                                       const Vec& target, Vec& costs) const
{
    Vec mu;
    static Vec var;
    int i0 = 0;
    if (outputs_def.find("e") != string::npos)
    {
        mu = output.subVec(i0, n_outputs);
        i0 += n_outputs;
    }
    else
        mu = expectation();
    if (outputs_def.find("v") != string::npos)
    {
        var = output.subVec(i0, n_outputs);
        i0 += n_outputs;
    }
    else
    {
        var.resize(n_outputs);
        variance(var);
    }
    real mse = 0;
    real logdensity = 0;
    for (int i=0; i<n_outputs; i++)
    {
        real diff = mu[i] - target[i];
        mse += diff*diff;
        logdensity += gauss_log_density_var(target[i], mu[i], var[i] + noise_sd[i]*noise_sd[i]);
    }
    // The order must match getTrainCostNames(): [0]="log-likelihood", [1]="mse".
    costs[0] = logdensity;
    costs[1] = mse;
}

void GaussianProcessRegressor::computeOutputAndCosts(const Vec& input, const Vec& target,
                                                     Vec& output, Vec& costs) const
{
    computeOutput(input, output);
    computeCostsFromOutputs(input, output, target, costs);
}

void GaussianProcessRegressor::computeCostsOnly(const Vec& input, const Vec& target, Vec& costs) const
{
    static Vec tmp_output;
    tmp_output.resize(outputsize());
    computeOutputAndCosts(input, target, tmp_output, costs);
}

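// Training: compute the Gram matrix K on the training inputs, normalize it
// in place if requested (filling both triangles by symmetry), eigendecompose
// it, and solve (K + noise_sd[i]^2 I) alpha(i) = targets_i for each output
// via inverseCovTimesVec.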
void GaussianProcessRegressor::train()
{
    int l = K.length();
    VMat input_rows = train_set.subMatColumns(0, inputsize());
    VMat target_rows = train_set.subMatColumns(inputsize(), targetsize());
    kernel->setDataForKernelMatrix(input_rows);
    kernel->computeGramMatrix(K);

    if (Gram_matrix_normalization == "centering_a_dotproduct")
    {
        columnMean(K, meanK);
        mean_allK = mean(meanK);
        int m = K.mod();
        for (int i=0; i<l; i++)
        {
            real* Ki = K[i];
            real* Kji_ = &K[0][i];
            for (int j=0; j<=i; j++, Kji_+=m)
            {
                real Kij = Ki[j] - meanK[i] - meanK[j] + mean_allK;
                Ki[j] = Kij;
                if (j<i)
                    *Kji_ = Kij;   // mirror into the upper triangle
            }
        }
    }
    else if (Gram_matrix_normalization == "centering_a_distance")
    {
        columnMean(K, meanK);
        mean_allK = mean(meanK);
        int m = K.mod();
        for (int i=0; i<l; i++)
        {
            real* Ki = K[i];
            real* Kji_ = &K[0][i];
            for (int j=0; j<=i; j++, Kji_+=m)
            {
                real Kij = -0.5*(Ki[j] - meanK[i] - meanK[j] + mean_allK);
                Ki[j] = Kij;
                if (j<i)
                    *Kji_ = Kij;
            }
        }
    }
    else if (Gram_matrix_normalization == "divisive")
    {
        columnMean(K, meanK);
        int m = K.mod();
        for (int i=0; i<l; i++)
        {
            real* Ki = K[i];
            real* Kji_ = &K[0][i];
            for (int j=0; j<=i; j++, Kji_+=m)
            {
                real Kij = Ki[j] / sqrt(meanK[i]*meanK[j]);
                Ki[j] = Kij;
                if (j<i)
                    *Kji_ = Kij;
            }
        }
    }

    // Eigendecompose the (normalized) Gram matrix, then solve
    // (K + noise_sd[i]^2 I) alpha(i) = targets_i for each output.
    int n_components = (max_nb_evectors < 0 || max_nb_evectors > l) ? l : max_nb_evectors;
    eigenVecOfSymmMat(K, n_components, eigenvalues, eigenvectors);

    for (int i=0; i<n_outputs; i++)
    {
        VMat target_column = target_rows.subMatColumns(i, 1);
        inverseCovTimesVec(noise_sd[i]*noise_sd[i], target_column.toMat().toVec(), alpha(i));
    }
}

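// Constant and log-determinant terms of the negative log marginal likelihood
// of the training targets,
//   0.5 * sum_i [ l*log(2*pi) + log det(K + noise_sd[i]^2 I) ],
// where log det is computed from the m available eigenvalues and the
// remaining l-m eigenvalues of K are taken to be 0. Note that the
// data-dependent quadratic term of the likelihood is not included here.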
real GaussianProcessRegressor::BayesianCost()
{
    int l = K.length();
    int m = eigenvectors.length();
    real nll = l * n_outputs * Log2Pi;
    for (int i=0; i<n_outputs; i++)
    {
        real sigma2_i = noise_sd[i]*noise_sd[i];
        // The l-m eigenvalues that were not computed are treated as 0,
        // so each contributes log(0 + sigma2_i).
        if (m < l)
            nll += (l-m) * safeflog(sigma2_i);
        for (int j=0; j<m; j++)
            nll += safeflog(eigenvalues[j] + sigma2_i);
    }
    nll *= 0.5;
    return nll;
}

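// Compute Cinv_u = (K + sigma2 I)^{-1} u from the eigendecomposition
// K = sum_i lambda_i v_i v_i' (the v_i are the rows of 'eigenvectors'):
//   C^{-1} u = u/sigma2 + sum_i (1/(lambda_i+sigma2) - 1/sigma2) (v_i.u) v_i
// This is exact when all eigenvectors are computed, and treats truncated
// directions as having eigenvalue 0 otherwise.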
void GaussianProcessRegressor::inverseCovTimesVec(real sigma2, Vec u, Vec Cinv_u) const
{
    int m = eigenvectors.length();
    real one_over_sigma2 = 1.0/sigma2;
    multiply(u, one_over_sigma2, Cinv_u);
    for (int i=0; i<m; i++)
    {
        Vec v_i = eigenvectors(i);
        real proj = dot(v_i, u);
        multiplyAdd(Cinv_u, v_i, proj*(1.0/(eigenvalues[i]+sigma2) - one_over_sigma2), Cinv_u);
    }
}

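// Quadratic form u' (K + sigma2 I)^{-1} u, using the same eigendecomposition
// identity as inverseCovTimesVec:
//   u' C^{-1} u = ||u||^2/sigma2 + sum_i (1/(lambda_i+sigma2) - 1/sigma2) (v_i.u)^2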
real GaussianProcessRegressor::QFormInverse(real sigma2, Vec u) const
{
    int m = eigenvectors.length();
    real one_over_sigma2 = 1.0/sigma2;
    real qf = pownorm(u)*one_over_sigma2;  // ||u||^2 / sigma2: the quadratic form needs the squared norm
    for (int i=0; i<m; i++)
    {
        Vec v_i = eigenvectors(i);
        real proj = dot(v_i, u);
        qf += (1.0/(eigenvalues[i]+sigma2) - one_over_sigma2) * proj*proj;
    }
    return qf;
}

} // end of namespace PLearn