PLearn: GraphicalBiText.h Source File

00001 #include <plearn_learners/generic/Learner.h> 00002 #include <plearn_learners/language/WordNet/WordNetOntology.h> 00003 #include <plearn/math/random.h> 00004 #include <plearn/math/TMat_maths.h> 00005 #include <time.h> 00006 #include <plearn/math/ProbSparseMatrix.h> 00007 #include <plearn/vmat/SubVMatrix.h> 00008 #include <plearn_learners/language/TextSenseSequenceVMatrix.h> 00009 #include <plearn/vmat/SelectColumnsVMatrix.h> 00010 #include <plearn/vmat/DiskVMatrix.h> 00011 #include <plearn/vmat/ConcatRowsVMatrix.h> 00012 #include <plearn_learners/language/Smoothing/SmoothedProbSparseMatrix.h> 00013 #include <plearn_learners/language/Smoothing/ProbVector.h> 00014 #include <plearn/vmat/Splitter.h> 00015 00016 00017 #define DISCOUNT_MASS 0.7 00018 //#define PROB_PREC 0.0001 is defined in SmoothedProbMatrix 00019 #define MAX_EM_IT 10 00020 #define EM_THRES 0.001 00021 #define INIT_ALPHA 0.80 00022 #define INIT_P_A 0.1 00023 #define DEF_INTERP 0.5 00024 00025 namespace PLearn { 00026 using namespace std; 00027 00028 // Load from file into VMat 00029 VMat loadToVMat(string file,string name, int window, int n_examples); 00030 00031 00032 class GraphicalBiText : public Learner { 00033 00034 00035 public : 00036 // Disambiguation model 00037 // window size used for disambiguation 00038 int window_size; 00039 int n_train_examples; // number of train example in semcor 00040 int n_test_examples; // number of test example in semcor 00041 int n_epoch; // number of epoch in EM-graphical model learning algorithm 00042 string source_path; // path to the ontology 00043 string source_voc; // path to source vocabulary 00044 string target_voc; // path to target vocabulary 00045 string train_file; // Bitext training file 00046 string valid_file; // Bitext validation file 00047 string key_file; // key file for senseval test set 00048 string sensemap_file; // sensmap file for coarse sense 00049 int sensemap_level; // level of sense grouping 1=all grouped 99 = all separated 00050 string semcor_train_path;// Path to semcor vmat 00051 string semcor_valid_path;// Path to semcor vmat 00052 string semcor_valid2_path;// Path to semcor vmat 00053 string semcor_test_path;// Path to semcor vmat 00054 string senseval2_train_path;// Path to Senseval2 train set VMat 00055 00056 00057 // Data 00058 VMat wsd_train ; 00059 VMat wsd_valid ; 00060 VMat wsd_valid2; 00061 VMat wsd_test ; 00062 VMat senseval2_train; 00063 00064 real update_threshold; 00065 00066 string output_dir;// dir for all outputs 00067 00068 private : 00069 00070 // Bitext Model 00071 // Sense table : P(S) 00072 ProbVector pS; 00073 // Probability mass in node c : P(C) 00074 ProbVector pMC; 00075 Vec pC; 00076 // Probability mass of the subtree rooted at c 00077 Vec pTC; 00078 // Probability of stoping in c 00079 Vec pA; 00080 Vec nA; 00081 00082 // target_voc - Sense table : P(F|S) 00083 ProbSparseMatrix nFS; 00084 ProbSparseMatrix pFS; 00085 // source_Voc - Sense table : P(E|S) 00086 ProbSparseMatrix nES;// Part of the graphical model 00087 ProbSparseMatrix pES; 00088 00089 // Common nodes structure 00090 // this structure stores the deepest common nodes for each (source,taget) word couple 00091 TMat<Set> commNode; 00092 //TMat<map<int,int> > commNode; 00093 map<int, Set> sens_to_conceptAncestors; 00094 00095 // target word -> senses map; the equivalent for the source words is in WordNet 00096 map<int, Set> target_word_to_senses; 00097 00098 // Independant bitext model 00099 // target_voc proba 00100 ProbVector pF; 00101 // source_Voc proba 00102 ProbVector pE; 00103 00104 // Joint probability bitext model P(E,F) 00105 ProbSparseMatrix nEF; 00106 ProbSparseMatrix pEF; 00107 00108 // For Entropy computation 00109 ProbSparseMatrix nSE; 00110 ProbSparseMatrix pSE; 00111 ProbSparseMatrix nSEbi; 00112 ProbSparseMatrix pSEbi; 00113 Vec KL; 00114 // store if we should use bitext estimated model for this word 00115 map<int,bool> BiSelect; 00116 00117 //Sense mapping for coarse sense 00118 map<string,string> sensemap; 00119 map <int,int> nodemap; 00120 map <int,int> node_level; 00121 00122 00123 // size of the input data for disambiguation (VMat) 00124 int n_fields; 00125 00126 ProbVector pEbase; 00127 ProbVector pSbase; 00128 ProbVector pSupbi; 00129 Vec nS; 00130 // Context proba 00131 ProbVector pH; 00132 ProbVector pHbase; 00133 ProbVector pHupbi; 00134 00135 ProbSparseMatrix nESbase; 00136 ProbSparseMatrix nESupbi; 00137 SmoothedProbSparseMatrix pESbase;// Estimated on Semcor 00138 SmoothedProbSparseMatrix pESupbi; 00139 00140 // context _Voc - Sense table : P(H|S) 00141 ProbSparseMatrix nHS;// Estimated on semcor 00142 SmoothedProbSparseMatrix pHS; 00143 ProbSparseMatrix nHSupbi;// Updated on bitexts 00144 SmoothedProbSparseMatrix pHSupbi; 00145 00146 // Ontology 00147 WordNetOntology ontology; 00148 int source_wsd_voc_size; 00149 int sense_size; 00150 int ss_size; 00151 00152 // Bitext 00153 // Source Vocabulary 00154 map<int, string> source_id_to_word; 00155 map<string, int> source_word_to_id; 00156 int source_voc_size; 00157 00158 // Target Vocabulary 00159 map<int, string> target_id_to_word; 00160 map<string, int> target_word_to_id; 00161 map<int,real> target_id_to_proba; 00162 int target_voc_size; 00163 00164 Set target_wsd_voc; 00165 int target_wsd_voc_size; 00166 00167 // Bitext Data 00168 Vec train_bitext_tgt; 00169 Vec train_bitext_src; 00170 Vec valid_bitext_tgt; 00171 Vec valid_bitext_src; 00172 00173 00174 // Interpolation coefficients 00175 real alpha_bn; 00176 real alpha_joint; 00177 00178 // Checksum variables 00179 Vec sum_epEC; 00180 Vec sum_fpFC; 00181 Vec sum_cpC; 00182 00183 00184 void compute_likelihood( Vec bitext_src, Vec bitext_tgt,string name, bool update); 00185 int getDeepestCommonAncestor(int s1, int s2); 00186 void compute_pTC(); 00187 void compute_pTC(int word); 00188 void distribute_pS_on_ancestors(int s,real probaToDistribute); 00189 void compute_node_level(); 00190 void compute_pMC(); 00191 void check_set_pA(); 00192 void printNode(int ss,ostream &out_hie); 00193 void update_pWS(ProbSparseMatrix& , int , string); 00194 real compute_efs_likelihood(int e,int f, int se); 00195 real compute_BN_likelihood(int e,int f, bool update, real nb); 00196 void optimize_interp_parameter(Vec tgt,Vec src, string name); 00197 void loadBitext(string train_file_name,string valid_file_name, bool update_voc); 00198 void compute_nodemap(int split_level); 00199 void set_nodemap(int node,int word); 00200 void print_sensemap(); 00201 void build_(); 00202 void init_WSD(); 00203 void init(); 00204 public: 00205 00206 GraphicalBiText(); 00207 virtual ~GraphicalBiText(); 00208 typedef Learner inherited; 00209 PLEARN_DECLARE_OBJECT(GraphicalBiText); 00210 00211 00212 static void declareOptions(OptionList& ol); 00213 void build(); 00214 00215 void use(const Vec& input, Vec& output) { PLERROR("NaiveBayes does not know 'use', only 'computeOutput'"); } 00216 void train(VMat training_set); 00217 void test(); 00218 00219 void train(int n_epoch); 00220 void senseTagBitext(string name); 00221 void check_consitency(); 00222 void print(string name); 00223 void printHierarchy(string name); 00224 void update_WSD_model(string name); 00225 void sensetag_valid_bitext(string name); 00226 void computeKL(); 00227 void loadSensemap(string sensemap_file); 00228 void compute_train_likelihood(string name); 00229 void compute_valid_likelihood(string name); 00230 00231 void test_WSD(VMat wsd_test, string name, TVec<string> v,bool select, real interp = DEF_INTERP); 00232 void setTrainingSet(VMat training_set, bool call_forget); 00233 }; 00234 } // end of namespace PLearn 00235