
SmoothedProbSparseMatrix.cc

// -*- C++ -*-

// PLearn (A C++ Machine Learning Library)
// Copyright (C) 2003 Christopher Kermorvant
//

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//  1. Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//
//  2. Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//
//  3. The name of the authors may not be used to endorse or promote
//     products derived from this software without specific prior written
//     permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
// NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This file is part of the PLearn library. For more information on the PLearn
// library, go to the PLearn Web site at www.plearn.org

#include "SmoothedProbSparseMatrix.h"

namespace PLearn {

SmoothedProbSparseMatrix::SmoothedProbSparseMatrix(int n_rows, int n_cols, string name, int mode, bool double_access)
    : ProbSparseMatrix(n_rows, n_cols, name, mode, double_access)
{
    smoothingMethod = 0;
}

// Normalize with Laplace smoothing.
// Warning: the resulting matrices are _NOT_ normalized as such!
// If P(x|y) = 0 in the matrix, it must be set to
//   1/(getHeight()+sumCol(y)) in COLUMN_WISE mode
//   1/(getWidth()+sumRow(y))  in ROW_WISE mode
void SmoothedProbSparseMatrix::normalizeCondLaplace(ProbSparseMatrix& nXY, bool clear_nXY)
{
    int nXY_height = nXY.getHeight();
    int nXY_width = nXY.getWidth();
    smoothingMethod = 1;
    if (mode == ROW_WISE && (nXY.getMode() == ROW_WISE || nXY.isDoubleAccessible()))
    {
        clear();
        normalizationSum.resize(nXY_height);
        for (int i = 0; i < nXY_height; i++)
        {
            // Laplace smoothing normalization
            real sum_row_i = nXY.sumRow(i) + nXY_width;
            // Store normalization sum
            normalizationSum[i] = sum_row_i;
            map<int, real>& row_i = nXY.getRow(i);
            for (map<int, real>::iterator it = row_i.begin(); it != row_i.end(); ++it)
            {
                int j = it->first;
                real nij = it->second;
                // Laplace smoothing
                if (nij > 0.0)
                {
                    real pij = (nij + 1) / sum_row_i;
                    set(i, j, pij);
                }
            }
        }
        if (clear_nXY)
            nXY.clear();
    }
    else if (mode == COLUMN_WISE && (nXY.getMode() == COLUMN_WISE || nXY.isDoubleAccessible()))
    {
        clear();
        normalizationSum.resize(nXY_width);
        for (int j = 0; j < nXY_width; j++)
        {
            // Laplace smoothing normalization
            real sum_col_j = nXY.sumCol(j) + nXY_height;
            // Store normalization sum
            normalizationSum[j] = sum_col_j;
            map<int, real>& col_j = nXY.getCol(j);
            for (map<int, real>::iterator it = col_j.begin(); it != col_j.end(); ++it)
            {
                int i = it->first;
                real nij = it->second;
                // Laplace smoothing
                if (nij > 0.0)
                    set(i, j, (nij + 1) / sum_col_j);
            }
        }
        if (clear_nXY)
            nXY.clear();
    }
    else
    {
        PLERROR("pXY and nXY accessibility modes must match");
    }
}
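// Illustration of the Laplace-smoothed estimates built above (row-wise case):
// a stored pair (i, j) gets P(j|i) = (n_ij + 1) / (sumRow(i) + width), and
// get() returns 1 / (sumRow(i) + width) for pairs that were never observed,
// so each row still sums to 1 even though only non-zero entries are stored.
// For example, a row with counts {2, 1} in a 4-column matrix yields 3/7 and
// 2/7 for the stored entries and 1/7 for each of the two unseen columns
// (7 = 2 + 1 + 4).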
void SmoothedProbSparseMatrix::normalizeCondBackoff(ProbSparseMatrix& nXY, real disc, Vec& bDist, bool clear_nXY, bool shadow)
{
    // disc is the fraction of the minimal value to discount: discval = minval*disc.
    // In case of integer counts, the discounted value is 1*disc = disc.
    int i, j;
    real nij, pij;
    real minval, discval;
    int nXY_height = nXY.getHeight();
    int nXY_width = nXY.getWidth();
    // Shadowing or non-shadowing smoothing
    if (shadow) {
        smoothingMethod = 2;
    } else {
        smoothingMethod = 3;
    }

    // Copy backoff distribution
    backoffDist.resize(bDist.size());
    backoffDist << bDist;
    if (mode == ROW_WISE && (nXY.getMode() == ROW_WISE || nXY.isDoubleAccessible())) {
        clear();
        if (backoffDist.size() != nXY_width)
            PLERROR("Wrong dimension for backoffDistribution");
        normalizationSum.resize(nXY_height);
        normalizationSum.clear();
        backoffNormalization.resize(nXY_height);
        backoffNormalization.clear();
        discountedMass.resize(nXY_height);
        discountedMass.clear();
        for (i = 0; i < nXY_height; i++) {
            // Normalization
            real sum_row_i = nXY.sumRow(i);
            if (sum_row_i == 0) {
                // If there is no count in this row: uniform distribution
                normalizationSum[i] = nXY_width;
                continue;
            } else {
                // Store normalization sum
                normalizationSum[i] = sum_row_i;
            }
            backoffNormalization[i] = 1.0;
            map<int, real>& row_i = nXY.getRow(i);
            // Compute minimal value
            minval = FLT_MAX;
            for (map<int, real>::iterator it = row_i.begin(); it != row_i.end(); ++it) {
                minval = it->second < minval ? it->second : minval;
            }
            discval = minval * disc;
            for (map<int, real>::iterator it = row_i.begin(); it != row_i.end(); ++it) {
                j = it->first;
                nij = it->second;
                if (nij > discval) {
                    discountedMass[i] += discval;
                    // Discount
                    pij = (nij - discval) / sum_row_i;
                    if (pij < 0)
                        PLERROR("modified count < 0 in backoff smoothing SmoothedProbSparseMatrix %s", nXY.getName().c_str());
                    // Update backoff normalization factor
                    backoffNormalization[i] -= backoffDist[j];
                } else {
                    pij = nij / sum_row_i;
                }
                // Set modified count
                set(i, j, pij);
            }
            if (discountedMass[i] == 0)
                PLERROR("Discounted mass is null but counts are not null in %s line %d", nXY.getName().c_str(), i);
        }
        if (clear_nXY)
            nXY.clear();
    } else if (mode == COLUMN_WISE && (nXY.getMode() == COLUMN_WISE || nXY.isDoubleAccessible())) {
        clear();
        normalizationSum.resize(nXY_width);
        normalizationSum.clear();
        backoffNormalization.resize(nXY_width);
        backoffNormalization.clear();
        discountedMass.resize(nXY_width);
        discountedMass.clear();
        for (j = 0; j < nXY_width; j++) {
            // Normalization
            real sum_col_j = nXY.sumCol(j);
            if (sum_col_j == 0) {
                // If there is no count in this column: uniform distribution
                normalizationSum[j] = nXY_height;
                continue;
            } else {
                // Store normalization sum
                normalizationSum[j] = sum_col_j;
            }
            backoffNormalization[j] = 1.0;
            map<int, real>& col_j = nXY.getCol(j);
            // Compute minimal value
            minval = FLT_MAX;
            for (map<int, real>::iterator it = col_j.begin(); it != col_j.end(); ++it) {
                minval = (it->second < minval && it->second != 0) ? it->second : minval;
            }
            discval = minval * disc;
            for (map<int, real>::iterator it = col_j.begin(); it != col_j.end(); ++it) {
                i = it->first;
                nij = it->second;
                if (nij > discval) {
                    discountedMass[j] += discval;
                    // Discount
                    pij = (nij - discval) / sum_col_j;
                    if (pij < 0)
                        PLERROR("modified count < 0 in backoff smoothing SmoothedProbSparseMatrix %s : i=%d j=%d p=%f", nXY.getName().c_str(), i, j, pij);
                    // Update backoff normalization factor
                    backoffNormalization[j] -= backoffDist[i];
                } else {
                    pij = nij / sum_col_j;
                }
                if (pij <= 0 || pij > 1)
                    PLERROR("Invalid smoothed probability %f in %s", pij, nXY.getName().c_str());
                set(i, j, pij);
            }
            if (discountedMass[j] == 0) {
                PLERROR("Discounted mass is null but counts are not null in %s col %d", nXY.getName().c_str(), j);
            }
        }
        if (clear_nXY)
            nXY.clear();
    } else {
        PLERROR("pXY and nXY accessibility modes must match");
    }
}
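// Illustration of the backoff (absolute discounting) estimates built above,
// row-wise case: every stored count larger than discval = minval * disc is
// reduced by discval, and the mass removed from row i (discountedMass[i]) is
// redistributed over the backoff distribution bDist. With shadowing
// (smoothingMethod == 2), get(i, j) returns
//   (n_ij - discval) / N_i                                          for stored pairs,
//   discountedMass[i] * bDist[j] / (N_i * backoffNormalization[i])  otherwise,
// where N_i = normalizationSum[i] and backoffNormalization[i] restricts the
// backoff mass to the unseen columns. Without shadowing (smoothingMethod == 3),
// the backoff term discountedMass[i] * bDist[j] / N_i is also added to stored
// pairs.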
real SmoothedProbSparseMatrix::get(int i, int j)
{
#ifdef BOUNDCHECK
    if (i < 0 || i >= height || j < 0 || j >= width)
        PLERROR("SmoothedProbSparseMatrix.get : out-of-bound access to (%d, %d), dims = (%d, %d)", i, j, height, width);
#endif
    // If the matrix is not yet smoothed
    if (smoothingMethod == 0) {
        return ProbSparseMatrix::get(i, j);
    }
    if (mode == ROW_WISE) {
        map<int, real>& row_i = rows[i];
        map<int, real>::iterator it = row_i.find(j);
        if (it == row_i.end()) {
            // Laplace smoothing
            if (smoothingMethod == 1)
                return 1 / normalizationSum[i];
            // If there is no data at all in this row: uniform distribution
            if (discountedMass[i] == 0)
                return 1 / normalizationSum[i];
            // Backoff smoothing
            if (smoothingMethod == 2)
                return discountedMass[i] * backoffDist[j] / (normalizationSum[i] * backoffNormalization[i]);
            // Non-shadowing backoff
            if (smoothingMethod == 3)
                return discountedMass[i] * backoffDist[j] / normalizationSum[i];
        } else {
            // Non-shadowing backoff
            if (smoothingMethod == 3)
                return it->second + discountedMass[i] * backoffDist[j] / normalizationSum[i];
            return it->second;
        }
    } else {
        map<int, real>& col_j = cols[j];
        map<int, real>::iterator it = col_j.find(i);
        if (it == col_j.end()) {
            // Laplace smoothing
            if (smoothingMethod == 1)
                return 1 / normalizationSum[j];
            // If there is no data at all in this column: uniform distribution
            if (discountedMass[j] == 0)
                return 1 / normalizationSum[j];
            // Backoff smoothing
            if (smoothingMethod == 2)
                return discountedMass[j] * backoffDist[i] / (normalizationSum[j] * backoffNormalization[j]);
            // Non-shadowing backoff
            if (smoothingMethod == 3)
                return discountedMass[j] * backoffDist[i] / normalizationSum[j];
        } else {
            // Non-shadowing backoff
            if (smoothingMethod == 3)
                return it->second + discountedMass[j] * backoffDist[i] / normalizationSum[j];
            return it->second;
        }
    }
    return 0; // not reached for a valid smoothingMethod
}


bool SmoothedProbSparseMatrix::checkCondProbIntegrity()
{
    real sum = 0.0;
    real backsum;
    if (normalizationSum.size() == 0)
        return false;
    //cout << " CheckCondIntegrity : mode " << smoothingMethod;
    if (mode == ROW_WISE) {
        for (int i = 0; i < height; i++) {
            map<int, real>& row_i = rows[i];
            if (smoothingMethod == 1)
                sum = (width - row_i.size()) / normalizationSum[i];
            else if (smoothingMethod == 2 || smoothingMethod == 3)
                sum = discountedMass[i] / normalizationSum[i];
            backsum = 0;
            for (map<int, real>::iterator it = row_i.begin(); it != row_i.end(); ++it) {
                sum += it->second;
                if (smoothingMethod == 2)
                    backsum += backoffDist[it->first];
            }
            if (smoothingMethod == 2) {
                if (fabs(1.0 - backsum - backoffNormalization[i]) > 1e-4) {
                    cout << " Inconsistent backoff normalization " << i << " : " << backoffNormalization[i] << " " << backsum;
                    return false;
                }
            }
            if (fabs(sum - 1.0) > 1e-4 && (sum > 0.0)) {
                cout << " Inconsistent line " << i << " sum = " << sum;
                return false;
            }
        }
        return true;
    } else {
        for (int j = 0; j < width; j++) {
            map<int, real>& col_j = cols[j];
            if (smoothingMethod == 1)
                sum = (height - col_j.size()) / normalizationSum[j];
            else if (smoothingMethod == 2 || smoothingMethod == 3)
                sum = discountedMass[j] / normalizationSum[j];
            backsum = 0;
            for (map<int, real>::iterator it = col_j.begin(); it != col_j.end(); ++it) {
                sum += it->second;
                if (smoothingMethod == 2)
                    backsum += backoffDist[it->first];
            }
            if (smoothingMethod == 2) {
                if (fabs(1.0 - backsum - backoffNormalization[j]) > 1e-4) {
                    cout << " Inconsistent backoff normalization " << j << " : " << backoffNormalization[j] << " " << backsum;
                    return false;
                }
            }
            if (fabs(sum - 1.0) > 1e-4 && (sum > 0.0)) {
                cout << " Inconsistent column " << j << " sum = " << sum;
                return false;
            }
        }
        return true;
    }
}
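// Serialization format: after the ProbSparseMatrix part, write() below emits
// (in plearn_ascii / plearn_binary mode) a block of the form
//   <getClassName()>( smoothingMethod normalizationSum backoffDist
//                     backoffNormalization discountedMass )
// and read() expects exactly the same layout: the class name immediately
// followed by '(', the five fields, and a closing ')'.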
PLERROR("raw/pretty_ascii write not implemented in %s", class_name.c_str()); 00343 break; 00344 case PStream::raw_binary : 00345 PLERROR("raw_binary write not implemented in %s", class_name.c_str()); 00346 break; 00347 case PStream::plearn_binary : 00348 case PStream::plearn_ascii : 00349 out.write(class_name + "("); 00350 out << smoothingMethod; 00351 out << normalizationSum; 00352 out << backoffDist; 00353 out << backoffNormalization; 00354 out << discountedMass; 00355 out.write(")\n"); 00356 break; 00357 default: 00358 PLERROR("unknown outmode in %s::write(PStream& out)", class_name.c_str()); 00359 break; 00360 } 00361 } 00362 00363 void SmoothedProbSparseMatrix::read(PStream& in) 00364 { 00365 ProbSparseMatrix::read(in); 00366 string class_name = getClassName(); 00367 switch (in.inmode) 00368 { 00369 case PStream::raw_ascii : 00370 PLERROR("raw_ascii read not implemented in %s", class_name.c_str()); 00371 break; 00372 case PStream::raw_binary : 00373 PLERROR("raw_binary read not implemented in %s", class_name.c_str()); 00374 break; 00375 case PStream::plearn_ascii : 00376 case PStream::plearn_binary : 00377 { 00378 in.skipBlanksAndCommentsAndSeparators(); 00379 string word(class_name.size() + 1, ' '); 00380 for (unsigned int i = 0; i < class_name.size() + 1; i++) 00381 in.get(word[i]); 00382 if (word != class_name + "(") 00383 PLERROR("in %s::(PStream& in), '%s' is not a proper header", class_name.c_str(), word.c_str()); 00384 in.skipBlanksAndCommentsAndSeparators(); 00385 in >> smoothingMethod; 00386 in.skipBlanksAndCommentsAndSeparators(); 00387 in >> normalizationSum; 00388 in.skipBlanksAndCommentsAndSeparators(); 00389 in >> backoffDist; 00390 in.skipBlanksAndCommentsAndSeparators(); 00391 in >> backoffNormalization; 00392 in.skipBlanksAndCommentsAndSeparators(); 00393 in >> discountedMass; 00394 in.skipBlanksAndCommentsAndSeparators(); 00395 int c = in.get(); 00396 if(c != ')') 00397 PLERROR("in %s::(PStream& in), expected a closing parenthesis, found '%c'", class_name.c_str(), c); 00398 } 00399 break; 00400 default: 00401 PLERROR("unknown inmode in %s::write(PStream& out)", class_name.c_str()); 00402 break; 00403 } 00404 } 00405 00406 void ComplementedProbSparseMatrix::complement(ProbSparseMatrix& nXY, bool clear_nXY) 00407 { 00408 int nXY_height = nXY.getHeight(); 00409 int nXY_width = nXY.getWidth(); 00410 00411 rowSum.resize(nXY_height); 00412 columnSum.resize(nXY_width); 00413 if (mode == ROW_WISE && (nXY.getMode() == ROW_WISE || nXY.isDoubleAccessible())){ 00414 for (int i = 0; i < nXY_height; i++){ 00415 map<int, real>& row_i = nXY.getRow(i); 00416 for (map<int, real>::iterator it = row_i.begin(); it != row_i.end(); ++it){ 00417 rowSum[i]+=it->second; 00418 columnSum[it->first]+=it->second; 00419 grandSum+=it->second; 00420 } 00421 } 00422 00423 if (clear_nXY) 00424 nXY.clear(); 00425 } else if (mode == COLUMN_WISE && (nXY.getMode() == COLUMN_WISE || nXY.isDoubleAccessible())){ 00426 00427 for (int j = 0; j < nXY_width; j++){ 00428 map<int, real>& col_j = nXY.getCol(j); 00429 for (map<int, real>::iterator it = col_j.begin(); it != col_j.end(); ++it){ 00430 rowSum[it->first]+=it->second; 00431 columnSum[j]+=it->second; 00432 grandSum+=it->second; 00433 } 00434 } 00435 if (clear_nXY) 00436 nXY.clear(); 00437 } else{ 00438 PLERROR("pXY and nXY accessibility modes must match"); 00439 } 00440 } 00441 00442 real ComplementedProbSparseMatrix::get(int i, int j) 00443 { 00444 #ifdef BOUNDCHECK 00445 if (i < 0 || i >= height || j < 0 || j >= width) 00446 
PLERROR("SmoothedProbSparseMatrix.get : out-of-bound access to (%d, %d), dims = (%d, %d)", i, j, height, width); 00447 #endif 00448 if (mode == ROW_WISE){ 00449 map<int, real>& row_i = rows[i]; 00450 map<int, real>::iterator it = row_i.find(j); 00451 if (it == row_i.end()){ 00452 return rowSum[j]/(grandSum-columnSum[i]); 00453 }else{ 00454 return (rowSum[j]-it->second)/(grandSum-columnSum[i]); 00455 } 00456 } else{ 00457 map<int, real>& col_j = cols[j]; 00458 map<int, real>::iterator it = col_j.find(i); 00459 if (it == col_j.end()){ 00460 return columnSum[j]/(grandSum-rowSum[i]); 00461 }else{ 00462 00463 return (columnSum[j]-it->second)/(grandSum-rowSum[i]); 00464 } 00465 } 00466 } 00467 00468 bool ComplementedProbSparseMatrix::checkCondProbIntegrity() 00469 { 00470 real sum = 0.0; 00471 00472 int null_size; 00473 // TO BE DONE 00474 00475 if (mode == ROW_WISE){ 00476 for (int i = 0; i < height; i++){ 00477 map<int, real>& row_i = rows[i]; 00478 00479 for (map<int, real>::iterator it = row_i.begin(); it != row_i.end(); ++it){ 00480 00481 } 00482 00483 if (fabs(sum - 1.0) > 1e-4 && (sum > 0.0)) 00484 cout << " Inconsistent line " << i << " sum = "<< sum; 00485 return false; 00486 } 00487 return true; 00488 }else{ 00489 for (int j = 0; j < width; j++){ 00490 map<int, real>& col_j = cols[j]; 00491 00492 for (map<int, real>::iterator it = col_j.begin(); it != col_j.end(); ++it){ 00493 00494 } 00495 00496 if(fabs(sum - 1.0) > 1e-4 && (sum > 0.0)){ 00497 cout << " Inconsistent column " << j << " sum = "<< sum; 00498 return false; 00499 } 00500 } 00501 return true; 00502 } 00503 } 00504 00505 00506 }

Generated on Tue Aug 17 16:05:50 2004 for PLearn by doxygen 1.3.7