// -*- C++ -*-

// PLearn (A C++ Machine Learning Library)
// Copyright (C) 2003 Christopher Kermorvant
//

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//  1. Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//
//  2. Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//
//  3. The name of the authors may not be used to endorse or promote
//     products derived from this software without specific prior written
//     permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
// NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This file is part of the PLearn library.
For more information on the PLearn 00033 // library, go to the PLearn Web site at www.plearn.org 00034 00037 #include "ProbVector.h" 00038 00039 namespace PLearn { 00040 00041 void ProbVector::smoothNormalize(string name,real discounting_value) 00042 { 00043 // smooth by discounting and Normalize 00044 00045 #ifdef DEBUG 00046 cout << "Smoothing "<< name; 00047 #endif 00048 bool err_smooth_flag=false; 00049 int f; 00050 int size = length_; 00051 real *v = data(); 00052 real word_seen=0; 00053 real sum_discounted=0.0; 00054 real sum_v = 0; 00055 00056 for(f=0;f<size;f++){ 00057 if (v[f]>discounting_value){ 00058 sum_v+=v[f]; 00059 v[f]-=discounting_value; 00060 sum_discounted +=discounting_value; 00061 word_seen++; 00062 continue; 00063 } 00064 // in the case of non integer counts (typically during E.M. algo) 00065 if (v[f]>0 && v[f]<discounting_value){ 00066 sum_v+=v[f]; 00067 err_smooth_flag=true; 00068 } 00069 } 00070 #ifdef DEBUG 00071 cout << ": discounted " << sum_discounted << " from " << word_seen << " seen events summing " <<sum_v ; 00072 #endif 00073 // distribute discounted mass 00074 real unseen_prob=sum_discounted/(sum_v*size); 00075 word_seen=0; 00076 for(f=0;f<size;f++){ 00077 // Distribute on both seen and unseen events 00078 //if (v[f]<=discounting_value){ 00079 word_seen++; 00080 v[f]/=sum_v; 00081 v[f]+= unseen_prob; 00082 // }else{ 00083 // v[f]/=sum_v; 00084 // } 00085 } 00086 #ifdef DEBUG 00087 cout << " redistribute " << unseen_prob << " to "<< word_seen << " unseen events" <<endl; 00088 #endif 00089 if(err_smooth_flag)PLWARNING("minimal value < discounted value in Backoff Smoothing smoothNormalize a probVector"); 00090 } 00091 00092 void ProbVector::normalize() 00093 { 00094 // Normalize the vector (sum = 1) 00095 int f; 00096 int size = length_; 00097 real *v = data(); 00098 real sum_v = 0; 00099 00100 for(f=0;f<size;f++){ 00101 sum_v+=v[f]; 00102 } 00103 for(f=0;f<size;f++){ 00104 v[f]/=sum_v; 00105 } 00106 } 00107 00108 }