
GaussianContinuum.cc

// -*- C++ -*-

// GaussianContinuum.cc
//
// Copyright (C) 2004 Yoshua Bengio & Martin Monperrus
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//  1. Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//
//  2. Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//
//  3. The name of the authors may not be used to endorse or promote
//     products derived from this software without specific prior written
//     permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
// NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This file is part of the PLearn library. For more information on the PLearn
// library, go to the PLearn Web site at www.plearn.org

/* *******************************************************
   * $Id: GaussianContinuum.cc,v 1.4 2004/08/17 15:28:21 larocheh Exp $
   ******************************************************* */

// Authors: Yoshua Bengio & Martin Monperrus

#include "GaussianContinuum.h"
#include <plearn/vmat/LocalNeighborsDifferencesVMatrix.h>
#include <plearn/var/ProductVariable.h>
#include <plearn/var/PlusVariable.h>
#include <plearn/var/SoftplusVariable.h>
#include <plearn/var/VarRowsVariable.h>
#include <plearn/var/VarRowVariable.h>
#include <plearn/var/SourceVariable.h>
#include <plearn/var/Var_operators.h>
#include <plearn/vmat/ConcatColumnsVMatrix.h>
#include <plearn/math/random.h>
#include <plearn/var/SumOfVariable.h>
#include <plearn/var/TanhVariable.h>
#include <plearn/var/NllSemisphericalGaussianVariable.h>
#include <plearn/var/DiagonalizedFactorsProductVariable.h>
#include <plearn/math/plapack.h>
#include <plearn/var/ColumnSumVariable.h>
#include <plearn/vmat/VMat_maths.h>
#include <plearn/vmat/ConcatRowsVMatrix.h>
#include <plearn/vmat/SubVMatrix.h>
#include <plearn/var/PDistributionVariable.h>
#include <plearn_learners/distributions/UniformDistribution.h>
#include <plearn/display/DisplayUtils.h>
#include <plearn/opt/GradientOptimizer.h>

namespace PLearn {
using namespace std;

// The hidden-layer neurons correspond to hyperplanes.
// smartInitialization initializes these hyperplanes so that they pass through
// randomly chosen points of the train_set; this way we are sure to cover the
// space of points well.
// c acts as a kind of inverse weight decay:
// the larger c is, the larger the weights and the more sharply the neurons cut the space.
Mat smartInitialization(VMat v, int n, real c, real regularization)
{
  int l = v->length();
  int w = v->width();

  Mat result(n,w);
  Mat temp(w,w);
  Vec b(w);
  b<<c;

  int i,j;

  for (i=0;i<n;++i)
  {
    temp.clear();
    for (j=0;j<w;++j)
    {
      v->getRow(uniform_multinomial_sample(l),temp(j));
    }
    // regularization avoids problems 1/ when the same index is drawn twice, 2/ when points are too close
    regularizeMatrix(temp,regularization);
    result(i) << solveLinearSystem(temp, b);
  }
  return result;
}

GaussianContinuum::GaussianContinuum()
/* ### Initialize all fields to their default value here */
  : use_noise(false), noise(-1), noise_type("uniform"), n_random_walk_step(0), n_random_walk_per_point(0),
    save_image_mat(false), min_sigma(0), min_diff(0), print_parameters(false), sm_bigger_than_sn(false),
    use_best_model(false), n_neighbors(5), n_neighbors_density(5), n_dim(1), compute_cost_every_n_epochs(5),
    architecture_type("single_neural_network"), output_type("tangent_plane"),
    n_hidden_units(-1), batch_size(1), norm_penalization(0), svd_threshold(1e-5),
    projection_error_regularization(0)
{
}

PLEARN_IMPLEMENT_OBJECT(GaussianContinuum, "Learns a continuous (uncountable) Gaussian mixture with non-local parametrization",
    "This learner implicitly estimates the density of the data through\n"
    "a generalization of the Gaussian mixture model and of the TangentLearner\n"
    "algorithm (see help on that class). The density is the fixed point of\n"
    "a random walk {z_t} that follows the following transition probabilities:\n"
    "   z_{t+1} sampled from a Gaussian associated with z_t, centered\n"
    "   at z_t + mu(z_t), with covariance matrix S(z_t).\n"
    "The semantic of that random walk is the following (and that is how\n"
    "it will be estimated). Given a point z_t, the sample z_{t+1} represents\n"
    "a 'near neighbor' of z_t. We assume that the density is smooth enough\n"
    "that the cloud of 'near neighbors' around z_t can be modeled by a Gaussian.\n"
    "The functions mu(.) and S(.) have globally estimated parameters (for example\n"
    "using neural nets or linear functions of x, or linear functions of a basis).\n"
    "Here we suppose that the eigenvalues of S(.) come from two groups:\n"
    "the first group should correspond to locally estimated principal\n"
    "directions of variations and there are no constraints on these eigenvalues\n"
    "(except that they are positive), while the second group should correspond\n"
    "to 'noise' directions, that have all the same value sigma2_noise\n"
    "i.e. it is not necessary to explicitly model the directions of variations\n"
    "(the eigenvectors) associated with the second group. In general we expect\n"
    "sigma2_noise to be small compared to the first group eigenvalues, which\n"
    "means that the Gaussians are flat in the corresponding directions, and\n"
    "that the first group variations correspond to modeling a manifold near\n"
    "which most of the data lie. 
Optionally, an embedding corresponding\n" 00138 "to variations associated with the first group of eigenvalues can be learnt\n" 00139 "by choosing for the architecture_type option a value of the form embedding_*.\n" 00140 "Although the density is not available in closed form, it is easy (but maybe slow)\n" 00141 "to sample from it: pick one of the training examples at random and then\n" 00142 "follow the random walk (ideally, a long time). It is also possible in\n" 00143 "principle to obtain a numerical estimate of the density at a point x,\n" 00144 "by sampling enough random walk points around x.\n" 00145 ); 00146 00147 /* MATHEMATICAL DETAILS 00148 00149 * Fixed point of the random walk is the density: 00150 00151 Let p(Z_t) represent the density of the t-th random walk sample Z_t (a r.v.). 00152 To obtain p(Z_{t+1}) we sample Z_t from p(Z_t) and then sample Z_{t+1} 00153 from p(Z_{t+1}|Z_t), using the Gaussian with mean z_t + mu(z_t) and 00154 covariance matrix S(z_t). Thus p(Z_{t+1}=x) = \int_y p(Z_t=y) p(Z_{t+1}=x|Z_t=y) dy. 00155 Then at the fixed point we should have p(Z_t) = p(Z_{t+1)=p(X), i.e. 00156 p(x) = \int_y p(y) p(x|y) dy 00157 which has the same form as a Gaussian mixture, with p(x|y) Gaussian in x, 00158 and the sum replaced by an integral (i.e. there is an uncountable 'number' 00159 of Gaussian components, one at each position y in space). It is possible 00160 to achieve this only because each Gaussian component p(x|y) has mean and variance that 00161 depend on y and on global parameters theta, and those parameters are estimated 00162 from data everywhere, and might generalize to new places. 00163 00164 * How to estimate the density numerically: 00165 00166 Although the density cannot be computed exactly, it can be estimated 00167 using a Gaussian mixture with a finite number of components. Suppose that 00168 we have sampled a set R of random samples on the above random walks 00169 (including also the training data, which we know to come from the 00170 true density). Then we obtain a Monte-Carlo approximation of 00171 the above equation as follows: 00172 p(x) ~=~ average_t p(x|x_t) 00173 00174 where x_t is in R (i.e. sampled from the distribution p(y)). This is 00175 simply a uniformly weighted Gaussian mixture centered on the data points 00176 and on the random walk points. If we want to get a more precise estimator 00177 of p(x), we should sample points more often around x, but then correct 00178 this bias, using importance sampling. A simple way to do this is to 00179 choose more points from to put in R in such a way as to give more 00180 preference to the near neighbors of x in R. Let q_x(x_t) be a discrete 00181 distribution over the elements of R which is non-zero everywhere but 00182 puts more weight on the neighbors of x. Then we create new samples, 00183 to be put in a set R', by performing random walks starting from 00184 points of R with probability q_x(x_t). The resulting estimator 00185 would be 00186 p(x) ~=~ average_{x_t in R'} p(x|x_t) / (q_x(x_t) |R'|). 00187 00188 * How to estimate mu(x) and S(x)? 00189 00190 We propose to estimate mu(x) and S(x) by minimizing the negative 00191 log-likelihood of the neighbors x_j of each training point x_i, 00192 according to the Gaussian with mean x_i + mu(x_i) and covariance 00193 matrix S(x_i), plus possibly some regularization term, such 00194 as weight decay on the parameters of the functions. 
In this 00195 implementation training proceeds by stochastic gradient, visiting 00196 each example x_i (with all of its neighbors) and then making 00197 a parameter update. 00198 00199 * Parametrization of mu(x) and S(x): 00200 00201 mu(x) is simply the output of a linear or neural-net function of x. 00202 S(x) is more difficult to parametrize. We consider two main solutions 00203 here: (1) semi-spherical (only two variances are considered: on the 00204 manifold and orthogonal to it), or (2) factor model with Cholesky 00205 decomposition for the manifold directions and a single shared variance 00206 for the directions orthogonal to the manifold. Note that we 00207 would prefer to parametrize S(x) in such a way as to make it 00208 easy to compute , v'S(x)^{-1}v for any vector v, and log(det(S(x))). 00209 00210 Consider the derivative of NLL == -log(p(y)) wrt log(p(y|x)): 00211 d(-log(p(y)))/d(log(p(y|x))) = -p(y|x)p(x)/p(y) = -p(x|y). 00212 (this also corresponds to the 'posterior' factor in EM). 00213 00214 The conditional log-likelihood log(p(y|x)) for a neighbor y 00215 of an example x is written 00216 log(p(y|x)) = -0.5(y-x-mu(x))'S(x)^{-1}(y-x-mu(x)) - 0.5*log(det(S(x))) - (n/2)log(2pi). 00217 00218 Hence dNLL/dtheta is obtained from 00219 0.5 p(x|y) (d((y-x-mu(x))'S(x)^{-1}(y-x-mu(x)))/dtheta + d(log(det(S(x))))/dtheta) (1) 00220 which gives significant weight only to the near neighbors y of x. 00221 00222 The gradient wrt mu(x) is in particular 00223 dNLL/dmu(x) = p(x|y) S(x)^{-1} (mu(x)+x-y). 00224 00225 * Semi-spherical covariance model: 00226 00227 The idea of the semi-spherical model is that we assume that the neighbors difference 00228 vector y-x has two components: (a) one along the tangent plane of the manifold, spanned 00229 by a set vectors F_i(x), the rows of F(x) a matrix-valued unconstrained estimated function, 00230 and (b) one orthogonal to that tangent plane. We write z = y-x-mu(x) = z_m + z_n, with z_m the 00231 component on the manifold and z_n the noise component. Since we want z_n orthogonal 00232 to the tangent plane, we choose it such that F z_n = 0. Since z_m is in the span 00233 of the rows F_i of F, we can write it as a linear combination of these rows, with 00234 weights w_i. Let w=(w_1,...w_d), then z_m = F'w. To find w, it is enough to find 00235 the projection of y-x along the tangent plane, which corresponds to the shortest 00236 possible z_n. Minimizing the norm of z_n, equal to ||z-F'w||^2 yields the first order equation 00237 F(z-F'w) = 0 00238 i.e. the solution is 00239 w = (FF')^{-1} Fz. 00240 In practice, this will be done by using a singular value decomposition of F', 00241 F' = U D V' 00242 so w = V D^{-2} V' F z = V D^{-2} V' V D U' z = V D^{-1} U' z. Note that 00243 z_m' z_n = w'F (z - F'w) = 0 hence z_m orthogonal to z_n. 00244 00245 By our model, the covariance matrix can be decomposed in two parts, 00246 S(x) = sigma2_manifold U U' + sigma2_noise N N' 00247 where M=[U | N] is the matrix whose columns are eigenvectors of S(x), with U the e-vectors 00248 along the manifold directions and N the e-vectors along the noise directions. 00249 It is easy to show that one does not need to explicitly represent the 00250 noise eigenvectors N, because both the columns of U and the columns of N 00251 are also eigenvectors of the identity matrix. Hence 00252 S(x) = (sigma2_manifold - sigma2_noise) U U' + sigma2_noise I. 00253 with I the nxn identity matrix. 00254 This can be shown by re-writing I = [U | N]' [U | N] and appriate algebra. 
00255 00256 It is also easy to show that S(x)^{-1} z = (1/sigma2_manifold) z_m + (1/sigma2_noise) z_n, 00257 that the quadratic form is 00258 z' S(x)^{-1} z = (1/sigma2_manifold) ||z_m||^2 + (1/sigma2_noise) ||z_n||^2, (2) 00259 and that 00260 log(det(S(x))) = d log(sigma2_manifold) + (n-d) log(sigma2_noise). (3) 00261 00262 How to show the above: 00263 @ We have defined S(x) = M diag(s) M' where s is a vector whose first d elements are sigma2_manifold 00264 and last n-d elements are sigma2_noise, and M=[U | N] are the eigenvectors, or 00265 S(x) = sum_{i=1}^d sigma2_manifold U_i U_i' + sum_{i=d+1}^n sigma2_noise N_i N_i' 00266 where U_i is a column of U and N_i a column of N. Hence 00267 S(x) = sigma2_manifold sum_{i=1}^d U_i U_i' - sigma2_noise sum_{i=1}^d U_i U_i' 00268 + sigma2_noise (sum_{i=1}^d U_i U_i' + sum_{i=d+1}^n N_i N_i') 00269 = (sigma2_manifold - sigma2_noise) sum_{i=1}^d U_i U_i' + sigma2_noise I 00270 = (sigma2_manifold - sigma2_noise) U U' + sigma2_noise I 00271 since sum_{i=1}^n M_i M_i' = M M' = I (since M is full rank). 00272 00273 @ S(x)^{-1} = M diag(s)^{-1} M' = (1/sigma2_manifold - 1/sigma2_noise) U U' + 1/sigma2_noise I 00274 using the same argument as above but replacing all sigma2* by 1/sigma2*. 00275 00276 @ Hence S(x)^{-1} z = S(x)^{-1} (z_m + z_n) 00277 = (1/sigma2_manifold - 1/sigma2_noise) z_m + 1/sigma2_noise (z_m + z_n) 00278 = 1/sigma2_manifold z_m + 1/sigma2_noise z_n 00279 where on the second line we used the fact that U U' acts as the identity 00280 matrix for vectors spanned by the columns of U, which can be shown as follows. 00281 Let z_m = sum_i a_i U_i. Then U U' z_m = sum_i a_i U U' U_i = sum_i a_i U e_i = sum_i a_i U_i = z_m. 00282 00283 @ Also, z' S(x)^{-1} z = (z_m + z_n) (1/sigma2_manifold z_m + 1/sigma2_noise z_n) 00284 = 1/sigma2_manifold ||z_m||^2 + 1/sigma2_noise ||z_n||^2 00285 since by construction z_m . z_n = 0. 00286 00287 @ Finally, log(det(S(x))) = sum_{i=1}^n log(s_i) 00288 = sum_{i=1}^d log(sigma2_manifold) + sum_{i=d+1}^n log(sigma2_noise) 00289 = d log(sigma2_manifold) + (n-d) log(sigma2_noise). 00290 00291 00292 * Gradients on covariance for the semi-spherical model: 00293 00294 We have already shown the gradient of NLL on mu(x) above. We need 00295 also here the gradient on sigma2_manifold, sigma2_noise, and F, all 00296 three of which are supposed to be functions of x (e.g. outputs of 00297 a neural network, so we need to provide the gradient on the output 00298 units of the neural network). Note that the sigma2's must be constrained 00299 to be positive (e.g. by squaring the output, using an exponential 00300 or softplus activation function). 00301 00302 dNLL/dsigma2_manifold = 0.5 p(x|y) ( d/sigma2_manifold - ||z_m||^2/sigma2_manifold^2) 00303 00304 N.B. this is the gradient on the variance, not on the standard deviation. 00305 00306 Proof: Recall eq.(1) and let theta = dsigma2_manifold. Using eq.(2) we obtain 00307 for the first term in (1): 00308 d/dsigma2_manifold (0.5/sigma2_manifold ||z_m||^2) = -0.5||z_m||^2/sigma2_manifold^2. 00309 Using (3) we obtain the second term 00310 d/dsigma2_manifold (0.5 d log(sigma2_manifold)) = 0.5 d/sigma2_manifold. 00311 00312 The same arguments yield the following for the gradient on sigma2_noise: 00313 00314 dNLL/dsigma2_noise = 0.5 p(x|y) ( (n-d)/sigma2_noise - ||z_n||^2/sigma2_noise^2) 00315 00316 00317 Now let us consider the more difficult case of the theta = F_{ij} (i in {1..d}, j in {1..n}). 
00318 The result is fortunately simple to write: 00319 00320 dNLL/dF = p(x|y) (1/sigma2_manifold - 1/sigma2_noise) w z_n' 00321 00322 Proof: First we see that the second term in eq.(1) does not depend on F because of eq.(3). 00323 For the first term of eq.(1), we obtain using (2) 00324 d(0.5 z'S(x)^{-1} z)/dF_{ij} 00325 = d/dF_{ij} ((0.5/sigma2_manifold) ||z_m||^2 + (0.5/sigma2_noise) ||z_n||^2) 00326 = d/dF_{ij} ((0.5/sigma2_manifold) ||F'w||^2 + (0.5/sigma2_noise) ||z-F'w||^2) 00327 = (1/sigma2_manifold) (F'w)' d(F'w)/dF_{ij} + (1/sigma2_noise) z_n' d(z-F'w)/dF_{ij} 00328 = (1/sigma2_manifold) (F'w)' d(F'w)/dF_{ij} - (1/sigma2_noise) z_n' d(F'w)/dF_{ij} (4) 00329 Note that w depends of F so we will have to compute two components: 00330 d(F'w)/dF_{ij} = w_i e_j + F' dw/dF_{ij} (5) 00331 Now recall how w depends on F: w = (FF')^{-1} F z, and recall the identity 00332 d(A^{-1})/dx = -A^{-1} dA/dx A^{-1} for square matrix A. Hence 00333 dw/dF_{ij} = - (FF')^{-1} d(FF')/dF_{ij} (FF')^{-1} F z + (FF')^{-1} dF/dF_{ij} z 00334 = - (FF')^{-1} ( F e_j e_i' + e_i e_j' F') w + (FF')^{-1} e_i e_j' z 00335 where we have replaced (FF')^{-1}Fz by w in the last factor of the first term, and 00336 where e_i is the d-vector with all 0's except a 1 at position i, and e_j is the n-vector 00337 with all 0's except a 1 at position j. It is easy to see that dF/dF_{ij} = e_i e_j' 00338 which is the matrix with all 0's except at position (i,j). Then 00339 d(FF')/dF_{ij} = F (dF/dF_{ij})' + dF/dF_{ij} F' = F e_j e_i' + e_i e_j' F'. 00340 00341 We are now ready to plug pop back and plug all these results together. First we plug 00342 the above in (5): 00343 d(F'w)/dF_{ij} = w_i e_j + F' (FF')^{-1} e_i e_j' z - (FF')^{-1} ( F e_j e_i' + e_i e_j' F') w 00344 then plug this back in (4) noting that FF' cancels with (FF')^{-1} everywhere in the sigma2_manifold term 00345 d(0.5 z'S(x)^{-1} z)/dF_{ij} = 00346 (1/sigma2_manifold) (w'F w_i e_j + w'e_i e_j' z - w'(F e_j e_i' + e_i e_j' F') w 00347 - (1/sigma2_noise) w_i z_n'e_j 00348 using z_n'F' = 0, and z_n' (FF')^{-1} = z_n'VD^{-2}V'=0 since z_n is orthogonal to every column of V. 00349 Note: F'(FF')^{-1}F = UDV'(VD^{-2}V')VDU' = UU', and UU'z = UU'(z_m+z_n) = z_m to simplify last term. 00350 In the sigma2_manifold term let us use the facts that (a) each sub-term is a scalar, (b) tr(AB)=tr(BA), 00351 (c) scalar = scalar', and (e) e_i'A e_j = A_{ij} to write everything in matrix form: 00352 (1/sigma2_manifold) (w'F e_j w_i + w'e_i e_j' z - w'(F e_j e_i' + e_i e_j' F') w) 00353 = (1/sigma2_manifold) (w'F e_j e_i' w + z'e_j e_i' w - w'F e_j e_i'w - z'UU'e_j e_i'w 00354 = (1/sigma2_manifold) (e_i'ww'F e_j + e_i'wz'e_j - e_i'ww'Fe_j - e_i'w z_m' e_j 00355 = (1/sigma2_manifold) (ww'F + wz' - ww'F - w z_m')_{ij} 00356 = (1/sigma2_manifold) (wz' - w z_m')_{ij} 00357 = (1/sigma2_manifold) (w z_n')_{ij} 00358 Now let us do the sigma2_noise term: 00359 (1/sigma2_noise) w_i z_n'e_j = (1/sigma2_noise) e_i' w z_n'e_j = (1/sigma2_noise) (w z_n')_{ij} 00360 Putting the sigma2_manifold term and the sigma2_noise term together we obtain in matrix form 00361 d(0.5 z'S(x)^{-1} z)/dF = (1/sigma2_manifold) w z_n' - (1/sigma2_noise) w z_n' 00362 i.e. 
the final result
     d(0.5 z'S(x)^{-1} z)/dF = (1/sigma2_manifold - 1/sigma2_noise) w z_n'
   which gives (using dlog(det(S(x)))/dF = 0), the claimed statement:
     dNLL/dF = p(x|y) (1/sigma2_manifold - 1/sigma2_noise) w z_n'

*/

void GaussianContinuum::declareOptions(OptionList& ol)
{
  // ### Declare all of this object's options here
  // ### For the "flags" of each option, you should typically specify
  // ### one of OptionBase::buildoption, OptionBase::learntoption or
  // ### OptionBase::tuningoption. Another possible flag to be combined with
  // ### is OptionBase::nosave

  declareOption(ol, "n_neighbors", &GaussianContinuum::n_neighbors, OptionBase::buildoption,
                "Number of nearest neighbors to consider for gradient descent.\n");

  declareOption(ol, "n_neighbors_density", &GaussianContinuum::n_neighbors_density, OptionBase::buildoption,
                "Number of nearest neighbors to consider for p(x) density estimation.\n");

  declareOption(ol, "n_dim", &GaussianContinuum::n_dim, OptionBase::buildoption,
                "Number of tangent vectors to predict.\n");

  declareOption(ol, "compute_cost_every_n_epochs", &GaussianContinuum::compute_cost_every_n_epochs, OptionBase::buildoption,
                "Frequency of the computation of the cost on the training and validation set.\n");

  declareOption(ol, "optimizer", &GaussianContinuum::optimizer, OptionBase::buildoption,
                "Optimizer that optimizes the cost function.\n");

  declareOption(ol, "variances_transfer_function", &GaussianContinuum::variances_transfer_function,
                OptionBase::buildoption,
                "Type of output transfer function for predicted variances, to force them to be >0:\n"
                "  square : take the square\n"
                "  exp : apply the exponential\n"
                "  softplus : apply the function log(1+exp(.))\n");

  declareOption(ol, "architecture_type", &GaussianContinuum::architecture_type, OptionBase::buildoption,
                "For pre-defined tangent_predictor types: \n"
                "   multi_neural_network : prediction[j] = b[j] + W[j]*tanh(c[j] + V[j]*x), where W[j] has n_hidden_units columns\n"
                "                          where there is a separate set of parameters for each of the n_dim tangent vectors to predict.\n"
                "   single_neural_network : prediction = b + W*tanh(c + V*x), where W has n_hidden_units columns\n"
                "                          where the resulting vector is viewed as a n_dim by n matrix\n"
                "   linear : prediction = b + W*x\n"
                "   embedding_neural_network: prediction[k,i] = d(e[k])/d(x[i]), where e(x) is an ordinary neural\n"
                "                             network representing the embedding function (see output_type option)\n"
                "   embedding_quadratic: prediction[k,i] = d(e_k)/d(x_i) = A_k x + b_k, where e_k(x) is a quadratic\n"
                "                        form in x, i.e. e_k = x' A_k x + b_k' x\n"
                "   (empty string): specify explicitly the function with tangent_predictor option\n"
                "where (b,W,c,V) are parameters to be optimized.\n");

  declareOption(ol, "n_hidden_units", &GaussianContinuum::n_hidden_units, OptionBase::buildoption,
                "Number of hidden units (if architecture_type is some kind of neural network)\n");

  declareOption(ol, "output_type", &GaussianContinuum::output_type, OptionBase::buildoption,
                "Default value (the only one considered if architecture_type != embedding_*) is\n"
                "   tangent_plane: output the predicted tangent plane.\n"
                "   embedding: output the embedding vector (only if architecture_type == embedding_*).\n"
                "   tangent_plane+embedding: output both (in this order).\n");

  declareOption(ol, "batch_size", &GaussianContinuum::batch_size, OptionBase::buildoption,
                "    how many samples to use to estimate the average gradient before updating the weights\n"
                "    0 is equivalent to specifying training_set->length() \n");

  declareOption(ol, "svd_threshold", &GaussianContinuum::svd_threshold, OptionBase::buildoption,
                "Threshold to accept singular values of F in solving for linear combination weights on tangent subspace.\n");

  declareOption(ol, "print_parameters", &GaussianContinuum::print_parameters, OptionBase::buildoption,
                "Indication that the parameters should be printed for the training set points.\n");

  declareOption(ol, "sm_bigger_than_sn", &GaussianContinuum::sm_bigger_than_sn, OptionBase::buildoption,
                "Indication that sm should always be bigger than sn.\n");

  declareOption(ol, "use_best_model", &GaussianContinuum::use_best_model, OptionBase::buildoption,
                "Indication that the best model on the validation set should be used.\n");

  declareOption(ol, "save_image_mat", &GaussianContinuum::save_image_mat, OptionBase::buildoption,
                "Indication that a matrix corresponding to the probabilities of the points on a 2d grid should be created.\n");

  declareOption(ol, "upper_y", &GaussianContinuum::upper_y, OptionBase::buildoption,
                "Upper bound on the y (second) coordinate.\n");

  declareOption(ol, "upper_x", &GaussianContinuum::upper_x, OptionBase::buildoption,
                "Upper bound on the x (first) coordinate.\n");

  declareOption(ol, "lower_y", &GaussianContinuum::lower_y, OptionBase::buildoption,
                "Lower bound on the y (second) coordinate.\n");

  declareOption(ol, "lower_x", &GaussianContinuum::lower_x, OptionBase::buildoption,
                "Lower bound on the x (first) coordinate.\n");

  declareOption(ol, "points_per_dim", &GaussianContinuum::points_per_dim, OptionBase::buildoption,
                "Number of points per dimension on the grid.\n");

  declareOption(ol, "projection_error_regularization", &GaussianContinuum::projection_error_regularization, OptionBase::buildoption,
                "Term added to the linear system matrix involved in fitting subspaces in the projection error computation.\n");

  declareOption(ol, "parameters", &GaussianContinuum::parameters, OptionBase::learntoption,
                "Parameters of the tangent_predictor function.\n");

  declareOption(ol, "Bs", &GaussianContinuum::Bs, OptionBase::learntoption,
                "The B matrices for the training set.\n");

  declareOption(ol, "Fs", &GaussianContinuum::Fs, OptionBase::learntoption,
                "The F (tangent planes) matrices for the training set.\n");

  declareOption(ol, "mus", &GaussianContinuum::mus, OptionBase::learntoption,
                "The mu vectors for the training set.\n");

  declareOption(ol, "sms", &GaussianContinuum::sms, OptionBase::learntoption,
                "The sm values for the training set.\n");

  declareOption(ol, "sns", &GaussianContinuum::sns, OptionBase::learntoption,
                "The sn values for the training set.\n");

  declareOption(ol, "min_sigma", &GaussianContinuum::min_sigma, OptionBase::buildoption,
                "The minimum value for sigma noise and manifold.\n");

  declareOption(ol, "min_diff", &GaussianContinuum::min_diff, OptionBase::buildoption,
                "The minimum value for the difference between sigma manifold and noise.\n");

  declareOption(ol, "n_random_walk_step", &GaussianContinuum::n_random_walk_step, OptionBase::buildoption,
                "The number of random walk steps.\n");

  declareOption(ol, "n_random_walk_per_point", &GaussianContinuum::n_random_walk_per_point, OptionBase::buildoption,
                "The number of random walks per training set point.\n");

  declareOption(ol, "noise", &GaussianContinuum::noise, OptionBase::buildoption,
                "Noise parameter for the training data.\n");

  declareOption(ol, "noise_type", &GaussianContinuum::noise_type, OptionBase::buildoption,
                "Type of the noise (\"uniform\").\n");

  declareOption(ol, "use_noise", &GaussianContinuum::use_noise, OptionBase::buildoption,
                "Indication that the training should be done using noise on training data.\n");

  // Now call the parent class' declareOptions
  inherited::declareOptions(ol);
}

void GaussianContinuum::build_()
{

  n = PLearner::inputsize_;

  if (n>0)
  {
    Var log_n_examples(1,1,"log(n_examples)");

    if (architecture_type == "embedding_neural_network")
    {
      if (n_hidden_units <= 0)
        PLERROR("GaussianContinuum::Number of hidden units should be positive, now %d\n",n_hidden_units);

      x = Var(n);
      if(noise > 0)
      {
        if(noise_type == "uniform")
        {
          PP<UniformDistribution> temp = new UniformDistribution();
          Vec lower_noise(n);
          Vec upper_noise(n);
          for(int i=0; i<n; i++)
          {
            lower_noise[i] = -1*noise;
            upper_noise[i] = noise;
          }
          temp->min = lower_noise;
          temp->max = upper_noise;
          dist = temp;
        }
        else PLERROR("In GaussianContinuum::build_() : noise_type %s not defined",noise_type.c_str());
        noise_var = new PDistributionVariable(x,dist);
        noise_var->setName(noise_type);
      }
      else
      {
        noise_var = new SourceVariable(n,1);
        noise_var->setName("no noise");
        for(int i=0; i<n; i++)
          noise_var->value[i] = 0;
      }

      W = Var(n_dim,n_hidden_units,"W ");
      c = Var(n_hidden_units,1,"c ");
      V = Var(n_hidden_units,n,"V ");
      muV = Var(n,n_hidden_units,"muV ");
      smV = Var(1,n_hidden_units,"smV ");
      smb = Var(1,1,"smB ");
      snV = Var(1,n_hidden_units,"snV ");
      snb = Var(1,1,"snB ");

      Var a = tanh(c + product(V,x));
      tangent_plane = diagonalized_factors_product(W,1-a*a,V);
      embedding = product(W,a);
      mu = product(muV,a);
      Var min_sig = new SourceVariable(1,1);
min_sig->value[0] = min_sigma; 00594 Var min_d = new SourceVariable(1,1); 00595 min_d->value[0] = min_diff; 00596 00597 // Path for noisy mu 00598 Var a_noisy = tanh(c + product(V,x+noise_var)); 00599 mu_noisy = product(muV,a_noisy); 00600 00601 if(sm_bigger_than_sn) 00602 { 00603 sn = softplus(snb + product(snV,a)) + min_sig; 00604 Var diff = softplus(smb + product(smV,a)) + min_d; 00605 sm = sn + diff; 00606 } 00607 else 00608 { 00609 sm = softplus(smb + product(smV,a)) + min_sig; 00610 sn = softplus(snb + product(snV,a)) + min_sig; 00611 } 00612 00613 mu_noisy->setName("mu_noisy "); 00614 tangent_plane->setName("tangent_plane "); 00615 mu->setName("mu "); 00616 sm->setName("sm "); 00617 sn->setName("sn "); 00618 a_noisy->setName("a_noisy "); 00619 a->setName("a "); 00620 embedding->setName("embedding "); 00621 x->setName("x "); 00622 00623 predictor = Func(x, W & c & V & muV & smV & smb & snV & snb, tangent_plane & mu & sm & sn); 00624 00625 if (output_type=="tangent_plane") 00626 output_f = Func(x, tangent_plane); 00627 else if (output_type=="embedding") 00628 output_f = Func(x, embedding); 00629 else if (output_type=="tangent_plane+embedding") 00630 output_f = Func(x, tangent_plane & embedding); 00631 else if(output_type == "tangent_plane_variance_normalized") 00632 output_f = Func(x,tangent_plane & sm); 00633 else if(output_type == "semispherical_gaussian_parameters") 00634 output_f = Func(x,tangent_plane & mu & sm & sn); 00635 output_f_all = Func(x,tangent_plane & mu & sm & sn); 00636 } 00637 else 00638 PLERROR("GaussianContinuum::build, unknown architecture_type option %s (should be 'neural_network', 'linear', or empty string '')\n", 00639 architecture_type.c_str()); 00640 00641 if (parameters.size()>0 && parameters.nelems() == predictor->parameters.nelems()) 00642 predictor->parameters.copyValuesFrom(parameters); 00643 parameters.resize(predictor->parameters.size()); 00644 for (int i=0;i<parameters.size();i++) 00645 parameters[i] = predictor->parameters[i]; 00646 00647 tangent_targets = Var(n_neighbors,n); 00648 00649 Var target_index = Var(1,1); 00650 Var neighbor_indexes = Var(n_neighbors,1); 00651 p_x = Var(train_set->length(),1); 00652 p_target = new VarRowsVariable(p_x,target_index); 00653 p_neighbors =new VarRowsVariable(p_x,neighbor_indexes); 00654 00655 // compute - log ( sum_{neighbors of x} P(neighbor|x) ) according to semi-spherical model 00656 Var nll = nll_semispherical_gaussian(tangent_plane, mu, sm, sn, tangent_targets, p_target, p_neighbors, noise_var, mu_noisy, 00657 use_noise, svd_threshold); // + log_n_examples; 00658 //nll_f = Func(tangent_plane & mu & sm & sn & tangent_targets, nll); 00659 Var knn = new SourceVariable(1,1); 00660 knn->value[0] = n_neighbors; 00661 sum_nll = new ColumnSumVariable(nll) / knn; 00662 cost_of_one_example = Func(x & tangent_targets & target_index & neighbor_indexes, predictor->parameters, sum_nll); 00663 noisy_data = Func(x,x + noise_var); // Func to verify what's the noisy data like (doesn't work so far, this problem will be investigated) 00664 verify_gradient_func = Func(predictor->inputs & tangent_targets & target_index & neighbor_indexes, predictor->parameters & mu_noisy, sum_nll); 00665 00666 if(validation_set == 0) validation_set = train_set; 00667 best_validation_cost = REAL_MAX; 00668 00669 train_nearest_neighbors.resize(train_set.length(), n_neighbors_density); 00670 validation_nearest_neighbors.resize(validation_set.length(), n_neighbors_density); 00671 00672 t_row.resize(n); 00673 Ut_svd.resize(n,n); 00674 
V_svd.resize(n_dim,n_dim); 00675 z.resize(n); 00676 zm.resize(n); 00677 zn.resize(n); 00678 x_minus_neighbor.resize(n); 00679 neighbor_row.resize(n); 00680 w.resize(n_dim); 00681 00682 Bs.resize(train_set.length()); 00683 Fs.resize(train_set.length()); 00684 mus.resize(train_set.length(), n); 00685 sms.resize(train_set.length()); 00686 sns.resize(train_set.length()); 00687 00688 } 00689 00690 } 00691 00692 void GaussianContinuum::make_random_walk() 00693 { 00694 if(n_random_walk_step < 1) PLERROR("Number of step in random walk should be at least one"); 00695 if(n_random_walk_per_point < 1) PLERROR("Number of random walk per training set point should be at least one"); 00696 ith_step_generated_set.resize(n_random_walk_step); 00697 00698 Mat generated_set(train_set.length()*n_random_walk_per_point,n); 00699 for(int t=0; t<train_set.length(); t++) 00700 { 00701 train_set->getRow(t,t_row); 00702 output_f_all(t_row); 00703 00704 real this_sm = sm->value[0]; 00705 real this_sn = sn->value[0]; 00706 Vec this_mu(n); this_mu << mu->value; 00707 static Mat this_F(n_dim,n); this_F << tangent_plane->matValue; 00708 00709 // N.B. this is the SVD of F' 00710 lapackSVD(this_F, Ut_svd, S_svd, V_svd); 00711 00712 00713 for(int rwp=0; rwp<n_random_walk_per_point; rwp++) 00714 { 00715 TVec<real> z_m(n_dim); 00716 TVec<real> z(n); 00717 for(int i=0; i<n_dim; i++) 00718 z_m[i] = normal_sample(); 00719 for(int i=0; i<n; i++) 00720 z[i] = normal_sample(); 00721 00722 Vec new_point = generated_set(t*n_random_walk_per_point+rwp); 00723 for(int j=0; j<n; j++) 00724 { 00725 new_point[j] = 0; 00726 for(int k=0; k<n_dim; k++) 00727 new_point[j] += Ut_svd(k,j)*z_m[k]; 00728 new_point[j] *= sqrt(this_sm-this_sn); 00729 new_point[j] += z[j]*sqrt(this_sn); 00730 } 00731 new_point += this_mu + t_row; 00732 } 00733 } 00734 PLearn::save("gen_points_0.psave",generated_set); 00735 ith_step_generated_set[0] = VMat(generated_set); 00736 00737 for(int step=1; step<n_random_walk_step; step++) 00738 { 00739 Mat generated_set(ith_step_generated_set[step-1].length(),n); 00740 for(int t=0; t<ith_step_generated_set[step-1].length(); t++) 00741 { 00742 ith_step_generated_set[step-1]->getRow(t,t_row); 00743 output_f_all(t_row); 00744 00745 real this_sm = sm->value[0]; 00746 real this_sn = sn->value[0]; 00747 Vec this_mu(n); this_mu << mu->value; 00748 static Mat this_F(n_dim,n); this_F << tangent_plane->matValue; 00749 00750 // N.B. 
this is the SVD of F' 00751 lapackSVD(this_F, Ut_svd, S_svd, V_svd); 00752 00753 TVec<real> z_m(n_dim); 00754 TVec<real> z(n); 00755 for(int i=0; i<n_dim; i++) 00756 z_m[i] = normal_sample(); 00757 for(int i=0; i<n; i++) 00758 z[i] = normal_sample(); 00759 00760 Vec new_point = generated_set(t); 00761 for(int j=0; j<n; j++) 00762 { 00763 new_point[j] = 0; 00764 for(int k=0; k<n_dim; k++) 00765 if(S_svd[k] > svd_threshold) 00766 new_point[j] += Ut_svd(k,j)*z_m[k]; 00767 new_point[j] *= sqrt(this_sm-this_sn); 00768 new_point[j] += z[j]*sqrt(this_sn); 00769 } 00770 new_point += this_mu + t_row; 00771 00772 } 00773 string path = "gen_points_" + tostring(step) + ".psave"; 00774 PLearn::save(path,generated_set); 00775 ith_step_generated_set[step] = VMat(generated_set); 00776 } 00777 00778 train_and_generated_set = vconcat(train_set & ith_step_generated_set); 00779 00780 // Compute Fs, Bs, mus, sms, sns 00781 Fs.resize(train_set.length() + train_set.length()*n_random_walk_per_point*n_random_walk_step); 00782 Bs.resize(train_set.length() + train_set.length()*n_random_walk_per_point*n_random_walk_step); 00783 mus.resize(train_set.length() + train_set.length()*n_random_walk_per_point*n_random_walk_step,n); 00784 sms.resize(train_set.length() + train_set.length()*n_random_walk_per_point*n_random_walk_step); 00785 sns.resize(train_set.length() + train_set.length()*n_random_walk_per_point*n_random_walk_step); 00786 00787 for(int t=0; t<train_and_generated_set.length(); t++) 00788 { 00789 Fs[t].resize(tangent_plane.length(), tangent_plane.width()); 00790 train_and_generated_set->getRow(t,t_row); 00791 predictor->fprop(t_row, Fs[t].toVec() & mus(t) & sms.subVec(t,1) & sns.subVec(t,1)); 00792 00793 // computing B 00794 00795 static Mat F_copy; 00796 F_copy.resize(Fs[t].length(),Fs[t].width()); 00797 F_copy << Fs[t]; 00798 // N.B. 
this is the SVD of F' 00799 lapackSVD(F_copy, Ut_svd, S_svd, V_svd); 00800 Bs[t].resize(n_dim,train_and_generated_set.width()); 00801 Bs[t].clear(); 00802 for (int k=0;k<S_svd.length();k++) 00803 { 00804 real s_k = S_svd[k]; 00805 if (s_k>svd_threshold) // ignore the components that have too small singular value (more robust solution) 00806 { 00807 real coef = 1/s_k; 00808 for (int i=0;i<n_dim;i++) 00809 { 00810 real* Bi = Bs[t][i]; 00811 for (int j=0;j<n;j++) 00812 Bi[j] += V_svd(i,k)*Ut_svd(k,j)*coef; 00813 } 00814 } 00815 } 00816 00817 } 00818 } 00819 00820 void GaussianContinuum::get_image_matrix(Mat& image, VMat image_points_vmat, int begin, string file_path) 00821 { 00822 VMat reference_set = new SubVMatrix(train_and_generated_set,begin,0,train_and_generated_set.length()-begin,n); 00823 cout << "Creating image matrix: " << file_path << endl; 00824 image.resize(points_per_dim,points_per_dim); 00825 image_nearest_neighbors.resize(points_per_dim*points_per_dim,n_neighbors_density); 00826 // Finding nearest neighbors 00827 00828 for(int t=0; t<image_points_vmat.length(); t++) 00829 { 00830 image_points_vmat->getRow(t,t_row); 00831 TVec<int> nn = image_nearest_neighbors(t); 00832 computeNearestNeighbors(reference_set, t_row, nn); 00833 } 00834 00835 for(int t=0; t<image_points_vmat.length(); t++) 00836 { 00837 00838 image_points_vmat->getRow(t,t_row); 00839 real this_p_x = 0; 00840 // fetching nearest neighbors for density estimation 00841 for(int neighbor=0; neighbor<n_neighbors_density; neighbor++) 00842 { 00843 train_and_generated_set->getRow(begin+image_nearest_neighbors(t,neighbor), neighbor_row); 00844 substract(t_row,neighbor_row,x_minus_neighbor); 00845 substract(x_minus_neighbor,mus(begin+image_nearest_neighbors(t,neighbor)),z); 00846 product(w, Bs[begin+image_nearest_neighbors(t,neighbor)], z); 00847 transposeProduct(zm, Fs[begin+image_nearest_neighbors(t,neighbor)], w); 00848 substract(z,zm,zn); 00849 this_p_x += exp(-0.5*(pownorm(zm,2)/sms[begin+image_nearest_neighbors(t,neighbor)] + pownorm(zn,2)/sns[begin+image_nearest_neighbors(t,neighbor)] 00850 + n_dim*log(sms[begin+image_nearest_neighbors(t,neighbor)]) + (n-n_dim)*log(sns[begin+image_nearest_neighbors(t,neighbor)])) - n/2.0 * Log2Pi); 00851 } 00852 00853 this_p_x /= reference_set.length(); 00854 int y_coord = t/points_per_dim; 00855 int x_coord = t%points_per_dim; 00856 image(points_per_dim - y_coord - 1,x_coord) = this_p_x; 00857 } 00858 PLearn::save(file_path,image); 00859 00860 } 00861 00862 void GaussianContinuum::compute_train_and_validation_costs() 00863 { 00864 // compute the Bs, mus, sms and sns 00865 00866 t_row.resize(train_set.width()); 00867 for(int t=0; t<train_set.length(); t++) 00868 { 00869 Fs[t].resize(tangent_plane.length(), tangent_plane.width()); 00870 train_set->getRow(t,t_row); 00871 predictor->fprop(t_row, Fs[t].toVec() & mus(t) & sms.subVec(t,1) & sns.subVec(t,1)); 00872 00873 // computing B 00874 00875 static Mat F_copy; 00876 F_copy.resize(Fs[t].length(),Fs[t].width()); 00877 F_copy << Fs[t]; 00878 // N.B. 
this is the SVD of F' 00879 lapackSVD(F_copy, Ut_svd, S_svd, V_svd); 00880 Bs[t].resize(n_dim,train_set.width()); 00881 Bs[t].clear(); 00882 for (int k=0;k<S_svd.length();k++) 00883 { 00884 real s_k = S_svd[k]; 00885 if (s_k>svd_threshold) // ignore the components that have too small singular value (more robust solution) 00886 { 00887 real coef = 1/s_k; 00888 for (int i=0;i<n_dim;i++) 00889 { 00890 real* Bi = Bs[t][i]; 00891 for (int j=0;j<n;j++) 00892 Bi[j] += V_svd(i,k)*Ut_svd(k,j)*coef; 00893 } 00894 } 00895 } 00896 00897 } 00898 00899 // estimate p(x) for the training set 00900 00901 real nll_train = 0; 00902 00903 for(int t=0; t<train_set.length(); t++) 00904 { 00905 00906 train_set->getRow(t,t_row); 00907 p_x->value[t] = 0; 00908 // fetching nearest neighbors for density estimation 00909 for(int neighbor=0; neighbor<n_neighbors_density; neighbor++) 00910 { 00911 train_set->getRow(train_nearest_neighbors(t,neighbor),neighbor_row); 00912 substract(t_row,neighbor_row,x_minus_neighbor); 00913 substract(x_minus_neighbor,mus(train_nearest_neighbors(t,neighbor)),z); 00914 product(w, Bs[train_nearest_neighbors(t,neighbor)], z); 00915 transposeProduct(zm, Fs[train_nearest_neighbors(t,neighbor)], w); 00916 substract(z,zm,zn); 00917 p_x->value[t] += exp(-0.5*(pownorm(zm,2)/sms[train_nearest_neighbors(t,neighbor)] + pownorm(zn,2)/sns[train_nearest_neighbors(t,neighbor)] 00918 + n_dim*log(sms[train_nearest_neighbors(t,neighbor)]) + (n-n_dim)*log(sns[train_nearest_neighbors(t,neighbor)])) - n/2.0 * Log2Pi); 00919 } 00920 p_x->value[t] /= train_set.length(); 00921 nll_train -= log(p_x->value[t]); 00922 00923 if(print_parameters) 00924 { 00925 output_f_all(t_row); 00926 cout << "data point = " << x->value << " parameters = " << tangent_plane->value << " " << mu->value << " " << sm->value << " " << sn->value << " p(x) = " << p_x->value[t] << endl; 00927 } 00928 } 00929 00930 nll_train /= train_set.length(); 00931 00932 cout << "NLL train = " << nll_train << endl; 00933 00934 // estimate p(x) for the validation set 00935 00936 real nll_validation = 0; 00937 00938 for(int t=0; t<validation_set.length(); t++) 00939 { 00940 00941 validation_set->getRow(t,t_row); 00942 real this_p_x = 0; 00943 // fetching nearest neighbors for density estimation 00944 for(int neighbor=0; neighbor<n_neighbors_density; neighbor++) 00945 { 00946 train_set->getRow(validation_nearest_neighbors(t,neighbor), neighbor_row); 00947 substract(t_row,neighbor_row,x_minus_neighbor); 00948 substract(x_minus_neighbor,mus(validation_nearest_neighbors(t,neighbor)),z); 00949 product(w, Bs[validation_nearest_neighbors(t,neighbor)], z); 00950 transposeProduct(zm, Fs[validation_nearest_neighbors(t,neighbor)], w); 00951 substract(z,zm,zn); 00952 this_p_x += exp(-0.5*(pownorm(zm,2)/sms[validation_nearest_neighbors(t,neighbor)] + pownorm(zn,2)/sns[validation_nearest_neighbors(t,neighbor)] 00953 + n_dim*log(sms[validation_nearest_neighbors(t,neighbor)]) + (n-n_dim)*log(sns[validation_nearest_neighbors(t,neighbor)])) - n/2.0 * Log2Pi); 00954 } 00955 00956 this_p_x /= train_set.length(); // When points will be added using a random walk, this will need to be changed (among other things...) 
00957 nll_validation -= log(this_p_x); 00958 } 00959 00960 nll_validation /= validation_set.length(); 00961 00962 cout << "NLL validation = " << nll_validation << endl; 00963 00964 if(use_best_model && nll_validation < best_validation_cost) 00965 { 00966 best_validation_cost = nll_validation; 00967 PLearn::save("temp_learner.psave",*this); 00968 } 00969 00970 } 00971 00972 // ### Nothing to add here, simply calls build_ 00973 void GaussianContinuum::build() 00974 { 00975 inherited::build(); 00976 build_(); 00977 } 00978 00979 extern void varDeepCopyField(Var& field, CopiesMap& copies); 00980 00981 void GaussianContinuum::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies) 00982 { inherited::makeDeepCopyFromShallowCopy(copies); 00983 00984 deepCopyField(cost_of_one_example, copies); 00985 varDeepCopyField(b, copies); 00986 varDeepCopyField(W, copies); 00987 varDeepCopyField(c, copies); 00988 varDeepCopyField(V, copies); 00989 varDeepCopyField(tangent_targets, copies); 00990 varDeepCopyField(muV, copies); 00991 varDeepCopyField(smV, copies); 00992 varDeepCopyField(smb, copies); 00993 varDeepCopyField(snV, copies); 00994 varDeepCopyField(snb, copies); 00995 varDeepCopyField(mu, copies); 00996 varDeepCopyField(sm, copies); 00997 varDeepCopyField(sn, copies); 00998 deepCopyField(parameters, copies); 00999 deepCopyField(optimizer, copies); 01000 deepCopyField(predictor, copies); 01001 } 01002 01003 01004 int GaussianContinuum::outputsize() const 01005 { 01006 if(output_type == "tangent_plane_variance_normalized") 01007 return output_f->outputsize-1; 01008 else 01009 return output_f->outputsize; 01010 } 01011 01012 void GaussianContinuum::forget() 01013 { 01014 if (train_set) initializeParams(); 01015 stage = 0; 01016 } 01017 01018 void GaussianContinuum::train() 01019 { 01020 01021 // Creation of points for matlab image matrices 01022 01023 if(save_image_mat) 01024 { 01025 if(n != 2) PLERROR("In GaussianContinuum::train(): Image matrix creation is only implemented for 2d problems"); 01026 01027 real step_x = (upper_x-lower_x)/(points_per_dim-1); 01028 real step_y = (upper_y-lower_y)/(points_per_dim-1); 01029 image_points_mat.resize(points_per_dim*points_per_dim,n); 01030 for(int i=0; i<points_per_dim; i++) 01031 for(int j=0; j<points_per_dim; j++) 01032 { 01033 image_points_mat(i*points_per_dim + j,0) = lower_x + j*step_x; 01034 image_points_mat(i*points_per_dim + j,1) = lower_y + i*step_y; 01035 } 01036 01037 image_points_vmat = VMat(image_points_mat); 01038 image_nearest_neighbors.resize(image_points_mat.length(), n_neighbors_density); 01039 01040 for(int t=0; t<image_points_vmat.length(); t++) 01041 { 01042 image_points_vmat->getRow(t,t_row); 01043 TVec<int> nn = image_nearest_neighbors(t); 01044 computeNearestNeighbors(train_set, t_row, nn); 01045 } 01046 01047 } 01048 01049 // find nearest neighbors... 01050 01051 // ... on the training set 01052 01053 for(int t=0; t<train_set.length(); t++) 01054 { 01055 train_set->getRow(t,t_row); 01056 TVec<int> nn = train_nearest_neighbors(t); 01057 computeNearestNeighbors(train_set, t_row, nn, t); 01058 } 01059 01060 // ... 
on the validation set 01061 01062 for(int t=0; t<validation_set.length(); t++) 01063 { 01064 validation_set->getRow(t,t_row); 01065 TVec<int> nn = validation_nearest_neighbors(t); 01066 computeNearestNeighbors(train_set, t_row, nn); 01067 } 01068 01069 VMat train_set_with_targets; 01070 VMat targets_vmat; 01071 if (!cost_of_one_example) 01072 PLERROR("GaussianContinuum::train: build has not been run after setTrainingSet!"); 01073 01074 targets_vmat = local_neighbors_differences(train_set, n_neighbors, false, true); 01075 01076 train_set_with_targets = hconcat(train_set, targets_vmat); 01077 train_set_with_targets->defineSizes(inputsize()+inputsize()*n_neighbors+1+n_neighbors,0); 01078 int l = train_set->length(); 01079 //log_n_examples->value[0] = log(real(l)); 01080 int nsamples = batch_size>0 ? batch_size : l; 01081 01082 // This was added for debugging, to see if there is a bug in the Var graph 01083 // or in the usage of the Optimizer and the meanOf function... 01084 01085 while(stage<nstages) 01086 { 01087 for(int t=0; t<train_set.length(); t++) 01088 { 01089 t_row.resize(train_set_with_targets.width()); 01090 train_set_with_targets->getRow(t,t_row); 01091 cost_of_one_example->inputs << t_row; 01092 cost_of_one_example->bproppath.clearGradient(); 01093 sum_nll->gradient[0] = -1*(dynamic_cast<GradientOptimizer*>((Optimizer*)optimizer))->start_learning_rate; 01094 cost_of_one_example->bproppath.fbprop(); 01095 //displayVarGraph(cost_of_one_example->bproppath,true,500); 01096 cost_of_one_example->parameters.updateAndClear(); 01097 cost_of_one_example->bproppath.clearGradient(); 01098 //displayVarGraph(cost_of_one_example->bproppath,true,500); 01099 } 01100 if(stage != 0 && stage%compute_cost_every_n_epochs == 0) 01101 compute_train_and_validation_costs(); 01102 stage++; 01103 } 01104 Var totalcost = meanOf(train_set_with_targets, cost_of_one_example, nsamples); 01105 cout << "----------------" << endl; 01106 01107 stage = 0; 01108 01109 // This is the real part of the program, that doesn't seem to work with the new 01110 // Var graph that uses noise on the data to learn the mus... 
01111 01112 if(optimizer) 01113 { 01114 optimizer->setToOptimize(parameters, totalcost); 01115 optimizer->build(); 01116 } 01117 else PLERROR("GaussianContinuum::train can't train without setting an optimizer first!"); 01118 01119 // number of optimizer stages corresponding to one learner stage (one epoch) 01120 int optstage_per_lstage = l/nsamples; 01121 01122 ProgressBar* pb = 0; 01123 if(report_progress>0) 01124 pb = new ProgressBar("Training GaussianContinuum from stage " + tostring(stage) + " to " + tostring(nstages), nstages-stage); 01125 01126 t_row.resize(train_set.width()); 01127 01128 //Vec row(inputsize()+inputsize()*n_neighbors+1+n_neighbors); 01129 //train_set_with_targets->getRow(0,row); 01130 //verify_gradient_func->verifyGradient(row); 01131 01132 int initial_stage = stage; 01133 bool early_stop=false; 01134 while(stage<nstages && !early_stop) 01135 { 01136 optimizer->nstages = optstage_per_lstage; 01137 train_stats->forget(); 01138 optimizer->early_stop = false; 01139 optimizer->optimizeN(*train_stats); 01140 train_stats->finalize(); 01141 if(verbosity>2) 01142 cout << "Epoch " << stage << " train objective: " << train_stats->getMean() << endl; 01143 ++stage; 01144 if(pb) 01145 pb->update(stage-initial_stage); 01146 01147 if(stage != 0 && stage%compute_cost_every_n_epochs == 0) 01148 compute_train_and_validation_costs(); 01149 } 01150 if(verbosity>1) 01151 cout << "EPOCH " << stage << " train objective: " << train_stats->getMean() << endl; 01152 01153 if(pb) 01154 delete pb; 01155 01156 if(use_best_model) 01157 PLearn::load("temp_learner.psave",*this); 01158 01159 //compute_train_and_validation_costs(); 01160 01161 Mat noisy_data_set(validation_set.length(),n); 01162 for(int t=0; t<validation_set.length(); t++) 01163 { 01164 validation_set->getRow(t,t_row); 01165 Vec noisy_point = noisy_data_set(t); 01166 noisy_data(t_row); 01167 noisy_point << x->value; 01168 } 01169 PLearn::save("noisy_data.psave",noisy_data_set); 01170 01171 if(save_image_mat) 01172 { 01173 cout << "Creating image matrix" << endl; 01174 image_prob_mat.resize(points_per_dim,points_per_dim); 01175 Mat image_points(points_per_dim*points_per_dim,2); 01176 Mat image_mu_vectors(points_per_dim*points_per_dim,2); 01177 Mat image_sigma_vectors(points_per_dim*points_per_dim,2); 01178 for(int t=0; t<image_points_vmat.length(); t++) 01179 { 01180 01181 image_points_vmat->getRow(t,t_row); 01182 real this_p_x = 0; 01183 // fetching nearest neighbors for density estimation 01184 for(int neighbor=0; neighbor<n_neighbors_density; neighbor++) 01185 { 01186 train_set->getRow(image_nearest_neighbors(t,neighbor), neighbor_row); 01187 substract(t_row,neighbor_row,x_minus_neighbor); 01188 substract(x_minus_neighbor,mus(image_nearest_neighbors(t,neighbor)),z); 01189 product(w, Bs[image_nearest_neighbors(t,neighbor)], z); 01190 transposeProduct(zm, Fs[image_nearest_neighbors(t,neighbor)], w); 01191 substract(z,zm,zn); 01192 this_p_x += exp(-0.5*(pownorm(zm,2)/sms[image_nearest_neighbors(t,neighbor)] + pownorm(zn,2)/sns[image_nearest_neighbors(t,neighbor)] 01193 + n_dim*log(sms[image_nearest_neighbors(t,neighbor)]) + (n-n_dim)*log(sns[image_nearest_neighbors(t,neighbor)])) - n/2.0 * Log2Pi); 01194 } 01195 01196 this_p_x /= train_set.length(); 01197 int y_coord = t/points_per_dim; 01198 int x_coord = t%points_per_dim; 01199 image_prob_mat(points_per_dim - y_coord - 1,x_coord) = this_p_x; 01200 01201 output_f_all(t_row); 01202 01203 image_points(t,0) = t_row[0]; 01204 image_points(t,1) = t_row[1]; 01205 01206 image_mu_vectors(t) 
<< mu->value; 01207 01208 image_sigma_vectors(t,0) = sm->value[0]; 01209 image_sigma_vectors(t,1) = sn->value[0]; 01210 } 01211 PLearn::save("image.psave",image_prob_mat); 01212 PLearn::save("image_points.psave",image_points); 01213 PLearn::save("image_mu_vectors.psave",image_mu_vectors); 01214 PLearn::save("image_sigma_vectors.psave",image_sigma_vectors); 01215 01216 if(n_random_walk_step > 0) 01217 { 01218 make_random_walk(); 01219 01220 string path = "image_rw_" + tostring(0) + ".psave"; 01221 image_prob_mat.clear(); 01222 get_image_matrix(image_prob_mat,image_points_vmat, 0, path); 01223 01224 for(int i=0; i<n_random_walk_step; i++) 01225 { 01226 path = "image_rw_" + tostring(i+1) + ".psave"; 01227 image_prob_mat.clear(); 01228 get_image_matrix(image_prob_mat,image_points_vmat, train_set.length()*n_random_walk_per_point+train_set.length(),path); 01229 } 01230 } 01231 } 01232 } 01233 01234 void GaussianContinuum::initializeParams() 01235 { 01236 if (seed_>=0) 01237 manual_seed(seed_); 01238 else 01239 PLearn::seed(); 01240 01241 if (architecture_type=="embedding_neural_network") 01242 { 01243 real delta = 1.0 / sqrt(real(inputsize())); 01244 fill_random_uniform(V->value, -delta, delta); 01245 delta = 1.0 / real(n_hidden_units); 01246 fill_random_uniform(W->matValue, -delta, delta); 01247 c->value.clear(); 01248 fill_random_uniform(smV->matValue, -delta, delta); 01249 smb->value.clear(); 01250 fill_random_uniform(smV->matValue, -delta, delta); 01251 snb->value.clear(); 01252 fill_random_uniform(snV->matValue, -delta, delta); 01253 fill_random_uniform(muV->matValue, -delta, delta); 01254 01255 } 01256 else PLERROR("other types not handled yet!"); 01257 01258 for(int i=0; i<p_x.length(); i++) 01259 p_x->value[i] = 1.0/p_x.length(); 01260 01261 if(optimizer) 01262 optimizer->reset(); 01263 } 01264 01265 01266 void GaussianContinuum::computeOutput(const Vec& input, Vec& output) const 01267 { 01268 if(output_type == "tangent_plane_variance_normalized") 01269 { 01270 int nout = outputsize()+1; 01271 Vec temp_output(nout); 01272 temp_output << output_f(input); 01273 Mat F = temp_output.subVec(0,temp_output.length()-1).toMat(n_dim,n); 01274 if(n_dim*n != temp_output.length()-1) PLERROR("WHAT!!!"); 01275 for(int i=0; i<F.length(); i++) 01276 { 01277 real norm = pownorm(F(i),1); 01278 F(i) *= sqrt(temp_output[temp_output.length()-1])/norm; 01279 } 01280 01281 output.resize(temp_output.length()-1); 01282 output << temp_output.subVec(0,temp_output.length()-1); 01283 } 01284 else 01285 { 01286 int nout = outputsize(); 01287 output.resize(nout); 01288 output << output_f(input); 01289 } 01290 } 01291 01292 void GaussianContinuum::computeCostsFromOutputs(const Vec& input, const Vec& output, 01293 const Vec& target, Vec& costs) const 01294 { 01295 01296 } 01297 01298 TVec<string> GaussianContinuum::getTestCostNames() const 01299 { 01300 return getTrainCostNames(); 01301 } 01302 01303 TVec<string> GaussianContinuum::getTrainCostNames() const 01304 { 01305 TVec<string> cost(1); cost[0] = "projection_error"; 01306 return cost; 01307 } 01308 01309 01310 01311 } // end of namespace PLearn
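
The density computation that recurs in get_image_matrix(), compute_train_and_validation_costs() and train() above evaluates, for each reference point x_t, the semi-spherical Gaussian term with z = x - x_t - mu(x_t), w = (FF')^{-1}Fz, z_m = F'w and z_n = z - z_m. The following standalone sketch restates that evaluation outside of PLearn. The function name, the row-major layout of F, and the use of a small Gaussian elimination in place of the code's SVD (with svd_threshold) are illustrative assumptions; F is assumed to have full row rank and sm, sn > 0.

#include <cmath>
#include <cstddef>
#include <vector>

// log p(x | x_t) for the semi-spherical Gaussian centered at x_t + mu.
// F is d x n (row-major), d = number of manifold directions, n = input dimension.
double log_p_x_given_xt(const std::vector<double>& x,
                        const std::vector<double>& x_t,
                        const std::vector<double>& mu,
                        const std::vector<double>& F,
                        std::size_t d, std::size_t n,
                        double sm, double sn)
{
    std::vector<double> z(n);
    for (std::size_t j = 0; j < n; ++j)
        z[j] = x[j] - x_t[j] - mu[j];

    // A = F F' (d x d) and b = F z (d): the normal equations for the tangent-plane weights w.
    std::vector<double> A(d * d, 0.0), b(d, 0.0), w(d, 0.0);
    for (std::size_t i = 0; i < d; ++i) {
        for (std::size_t k = 0; k < d; ++k)
            for (std::size_t j = 0; j < n; ++j)
                A[i * d + k] += F[i * n + j] * F[k * n + j];
        for (std::size_t j = 0; j < n; ++j)
            b[i] += F[i * n + j] * z[j];
    }

    // Solve A w = b by Gaussian elimination with partial pivoting (d is small);
    // the real code uses an SVD and drops singular values below svd_threshold instead.
    for (std::size_t col = 0; col < d; ++col) {
        std::size_t piv = col;
        for (std::size_t r = col + 1; r < d; ++r)
            if (std::fabs(A[r * d + col]) > std::fabs(A[piv * d + col])) piv = r;
        for (std::size_t c = 0; c < d; ++c) std::swap(A[col * d + c], A[piv * d + c]);
        std::swap(b[col], b[piv]);
        for (std::size_t r = col + 1; r < d; ++r) {
            double f = A[r * d + col] / A[col * d + col];
            for (std::size_t c = col; c < d; ++c) A[r * d + c] -= f * A[col * d + c];
            b[r] -= f * b[col];
        }
    }
    for (std::size_t i = d; i-- > 0; ) {
        double s = b[i];
        for (std::size_t c = i + 1; c < d; ++c) s -= A[i * d + c] * w[c];
        w[i] = s / A[i * d + i];
    }

    // z_m = F' w (manifold component), z_n = z - z_m (noise component),
    // then the quadratic form and log-determinant of the semi-spherical covariance.
    double norm_zm2 = 0.0, norm_zn2 = 0.0;
    for (std::size_t j = 0; j < n; ++j) {
        double zm_j = 0.0;
        for (std::size_t i = 0; i < d; ++i) zm_j += F[i * n + j] * w[i];
        double zn_j = z[j] - zm_j;
        norm_zm2 += zm_j * zm_j;
        norm_zn2 += zn_j * zn_j;
    }
    const double log_two_pi = 1.8378770664093453; // log(2*pi)
    return -0.5 * (norm_zm2 / sm + norm_zn2 / sn
                   + d * std::log(sm) + (n - d) * std::log(sn)
                   + n * log_two_pi);
}

As in the code, p(x) is then approximated by averaging exp(log_p_x_given_xt(...)) over the retained reference points; note that the code divides by the length of the whole reference set even though only the n_neighbors_density nearest neighbors contribute terms to the sum.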
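
The transition step of make_random_walk() can likewise be summarized as sampling z_{t+1} = z_t + mu(z_t) + sqrt(sm - sn) * (sum_i eps_m[i] U_i) + sqrt(sn) * eps, with eps_m ~ N(0, I_d), eps ~ N(0, I_n) and U_i the orthonormal tangent directions obtained from the SVD of F'. Its covariance is (sm - sn) times the projector onto the tangent plane plus sn times the identity, i.e. S(z_t) as derived in the comments above. A minimal sketch under the same caveats (plain C++, illustrative names, orthonormal rows of U given, and sm >= sn as enforced by the sm_bigger_than_sn option):

#include <cmath>
#include <cstddef>
#include <random>
#include <vector>

// One random-walk transition from z_t, given mu(z_t), the d orthonormal
// tangent directions U (d x n, row-major) and the two variances sm >= sn.
std::vector<double> random_walk_step(const std::vector<double>& z_t,
                                     const std::vector<double>& mu,
                                     const std::vector<double>& U,
                                     std::size_t d, std::size_t n,
                                     double sm, double sn,
                                     std::mt19937& rng)
{
    std::normal_distribution<double> gauss(0.0, 1.0);
    std::vector<double> eps_m(d), next(n);
    for (std::size_t i = 0; i < d; ++i) eps_m[i] = gauss(rng);
    for (std::size_t j = 0; j < n; ++j) {
        // component along the tangent plane, plus isotropic noise in all directions
        double along_manifold = 0.0;
        for (std::size_t i = 0; i < d; ++i) along_manifold += U[i * n + j] * eps_m[i];
        next[j] = z_t[j] + mu[j]
                + std::sqrt(sm - sn) * along_manifold
                + std::sqrt(sn) * gauss(rng);
    }
    return next;
}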
