
GraphicalBiText.cc

#include "GraphicalBiText.h"



namespace PLearn {
using namespace std;

PLEARN_IMPLEMENT_OBJECT(GraphicalBiText, "Probabilistically tag a bitext (English-other language) with senses from WordNet", "NO HELP");

GraphicalBiText::GraphicalBiText() // DEFAULT VALUES FOR ALL OPTIONS
    :
    window_size(3),
    n_epoch(5),
    source_path("/u/larocheh/myUserExp/WSD/features/world3"),
    semcor_train_path("/u/kermorvc/Data/Semcor/semcor1.7/1/train_corpus_all_wn17"),
    semcor_valid_path("/u/kermorvc/Data/Semcor/semcor1.7/1/valid1_corpus_all_wn17"),
    semcor_valid2_path("/u/kermorvc/Data/Semcor/semcor1.7/1/valid2_corpus_all_wn17"),
    semcor_test_path("/u/kermorvc/Data/Semcor/semcor1.7/1/test_corpus_all_wn17"),
    senseval2_train_path("/u/kermorvc/Data/Senseval/english-lex-sample/train/eng-lex_world3"),
    update_threshold(0),
    output_dir("./")
{
}

GraphicalBiText::~GraphicalBiText()
{
}

void GraphicalBiText::declareOptions(OptionList& ol)
{
    declareOption(ol, "window_size", &GraphicalBiText::window_size, OptionBase::buildoption,
                  "size of the context window for disambiguation (same on the right side and on the left side)\n");
    declareOption(ol, "n_epoch", &GraphicalBiText::n_epoch, OptionBase::buildoption,
                  "number of iterations of the EM learning algorithm\n");
    declareOption(ol, "source_path", &GraphicalBiText::source_path, OptionBase::buildoption,
                  "path to the ontology\n");
    declareOption(ol, "source_voc", &GraphicalBiText::source_voc, OptionBase::buildoption,
                  "path to the source language vocabulary\n");
    declareOption(ol, "target_voc", &GraphicalBiText::target_voc, OptionBase::buildoption,
                  "path to the target language vocabulary\n");
    declareOption(ol, "train_file", &GraphicalBiText::train_file, OptionBase::buildoption,
                  "path to the bitext training file\n");
    declareOption(ol, "valid_file", &GraphicalBiText::valid_file, OptionBase::buildoption,
                  "path to the bitext validation file\n");
    declareOption(ol, "sensemap_file", &GraphicalBiText::sensemap_file, OptionBase::buildoption,
                  "path to the sensemap file for coarse senses\n");
    declareOption(ol, "sensemap_level", &GraphicalBiText::sensemap_level, OptionBase::buildoption,
                  "level of sense grouping: 1 = all grouped, 99 = all separated\n");
    declareOption(ol, "semcor_train_path", &GraphicalBiText::semcor_train_path, OptionBase::buildoption,
                  "path to the SemCor training VMat file\n");
    declareOption(ol, "semcor_valid_path", &GraphicalBiText::semcor_valid_path, OptionBase::buildoption,
                  "path to the SemCor validation VMat file\n");
    declareOption(ol, "semcor_valid2_path", &GraphicalBiText::semcor_valid2_path, OptionBase::buildoption,
                  "path to a second SemCor validation VMat file\n");
    declareOption(ol, "semcor_test_path", &GraphicalBiText::semcor_test_path, OptionBase::buildoption,
                  "path to the SemCor test VMat file\n");
    declareOption(ol, "update_threshold", &GraphicalBiText::update_threshold, OptionBase::buildoption,
                  "P(s|e,f) threshold above which the bitext data is used\n");
    declareOption(ol, "output_dir", &GraphicalBiText::output_dir, OptionBase::buildoption,
                  "directory for all outputs\n");

}

void GraphicalBiText::build()
{
    inherited::build();
    build_();

}
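
// ---------------------------------------------------------------------------
// Illustrative usage sketch (a hypothetical example, not part of the original
// file): it shows how the build options declared above might be set directly
// from C++ before calling build(). It assumes the options are public members,
// as is customary for PLearn build options; all file names are placeholders.
//
//   PP<GraphicalBiText> model = new GraphicalBiText();
//   model->window_size = 3;                    // context words on each side
//   model->n_epoch     = 5;                    // EM iterations
//   model->source_path = "ontology/world3";    // prefix of the .voc/.synsets/.ontology/.sense_key files
//   model->source_voc  = "source_voc.txt";     // words to disambiguate
//   model->target_voc  = "target_voc.txt";
//   model->train_file  = "bitext_train.giza";  // GIZA++ format: tgt_word src_word
//   model->valid_file  = "bitext_valid.giza";
//   model->output_dir  = "./results/";
//   model->build();                            // loads ontology, vocabularies and bitext
// ---------------------------------------------------------------------------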
00072 00073 void GraphicalBiText::build_() 00074 { 00075 00076 // Used to read files 00077 string line; 00078 vector<string> tokens; 00079 string word; 00080 int id; 00081 SetIterator sit; 00082 00083 00084 alpha_bn= INIT_ALPHA; 00085 alpha_joint = INIT_ALPHA; 00086 00087 // Load Ontology 00088 string wsd_voc = source_path+".voc"; 00089 string synset_file = source_path+".synsets"; 00090 string ontology_file = source_path+".ontology"; 00091 string sense_key_file = source_path+".sense_key"; 00092 00093 ontology = WordNetOntology(wsd_voc, synset_file, ontology_file, sense_key_file,false, false); 00094 // ontology = WordNetOntology(source_voc,1,1,1); 00095 // BOUH : dirty patch... 00096 ontology.fillTempWordToSensesTVecMap(); 00097 00098 00099 00100 // Warning : original sense and synset sizes are kept even if the ontology is pruned later 00101 // since these ss_id are used in VMat 00102 source_wsd_voc_size = ontology.getVocSize(); 00103 source_voc_size = ontology.getVocSize(); 00104 sense_size = ontology.getSenseSize(); 00105 ss_size = ontology.getMaxSynsetId() + 1; 00106 cout << "|original ontology voc| = " << source_wsd_voc_size << endl; 00107 cout << "|original sense| = " << sense_size << endl; 00108 cout << "|original synset| = " << ss_size << endl; 00109 00110 // Store original voc in internal source voc structure 00111 // This voc comes with the ontology 00112 Set all_words = ontology.getAllWords(); 00113 for (sit = all_words.begin(); sit != all_words.end(); ++sit){ 00114 word = ontology.getWord(*sit); 00115 id = ontology.getWordId(word); 00116 source_word_to_id[word]=id; 00117 source_id_to_word[id]=word; 00118 } 00119 00120 cout << "| source voc | = "<< source_word_to_id.size()<< endl; 00121 //Load WSD language vocabulary 00122 // only these words are to be disambiguated 00123 ifstream if_voc(source_voc.c_str()); 00124 if (!if_voc) PLERROR("can't open %s", source_voc.c_str()); 00125 Set words_to_be_kept; 00126 int oov_id = ontology.getWordId(OOV_TAG); 00127 00128 int wn_id; 00129 while (!if_voc.eof()){ 00130 getline(if_voc, line, '\n'); 00131 if (line == "") continue; 00132 if (line[0] == '#' && line[1] == '#') continue; 00133 tokens = split(line, " "); 00134 if (tokens.size() != 1) PLERROR("target vocabulary file format error (line = '%s')", line.c_str()); 00135 wn_id = ontology.getWordId(tostring(tokens[0])); 00136 if (wn_id==oov_id && tostring(tokens[0])!=OOV_TAG){ 00137 PLWARNING("word to disambiguate is not in the ontology %s", line.c_str()); 00138 }else{ 00139 words_to_be_kept.insert(wn_id); 00140 } 00141 } 00142 if_voc.close(); 00143 // Remove unwanted words 00144 for (sit = all_words.begin(); sit != all_words.end(); ++sit){ 00145 if( words_to_be_kept.find(*sit)== words_to_be_kept.end()){ 00146 // remove 00147 ontology.removeWord(*sit); 00148 } 00149 } 00150 ontology.removeNonReachableSynsets(); 00151 00152 cout << "|pruned ontology voc| = " << ontology.getVocSize()<<endl; 00153 cout << "|pruned sense| = " << ontology.getSenseSize() << endl; 00154 cout << "|pruned synset| = " << ontology.getMaxSynsetId() + 1 << endl; 00155 00156 00157 //Load WSD target language vocabulary 00158 00159 ifstream if_tvoc(target_voc.c_str()); 00160 if (!if_tvoc) PLERROR("can't open %s", target_voc.c_str()); 00161 int next_id=0; 00162 while (!if_tvoc.eof()) { 00163 getline(if_tvoc, line, '\n'); 00164 if (line == "") continue; 00165 if (line[0] == '#' && line[1] == '#') continue; 00166 tokens = split(line, " "); 00167 if (tokens.size() != 1) PLERROR("target vocabulary file format error (line = '%s')", 
line.c_str()); 00168 target_id_to_word[next_id]=tostring(tokens[0]); 00169 target_word_to_id[tostring(tokens[0])]=next_id; 00170 target_wsd_voc.insert(next_id); 00171 next_id++; 00172 } 00173 if_tvoc.close(); 00174 // Add OV if necessary 00175 if (target_word_to_id.find(OOV_TAG)==target_word_to_id.end()){ 00176 target_word_to_id[OOV_TAG]=next_id; 00177 cout << " add OOV to target vocabulary " << endl; 00178 next_id++; 00179 } 00180 target_wsd_voc_size = target_wsd_voc.size(); 00181 target_voc_size = target_wsd_voc_size; 00182 cout << "|WSD target voc| = " <<target_wsd_voc_size<<endl; 00183 00184 00185 loadBitext(train_file, valid_file,0); 00186 cout << "|target voc| = " << target_voc_size<<endl; 00187 cout << "|source voc| = " << source_voc_size<<endl; 00188 00189 00190 // common node structure allocation 00191 commNode.resize(source_wsd_voc_size,target_wsd_voc_size); 00192 00193 00194 00195 // Probability matrices allocations 00196 00197 // Debuging variables 00198 sum_epEC.resize(ss_size); 00199 sum_fpFC.resize(ss_size); 00200 sum_cpC.resize(ss_size); 00201 00202 pMC.resize(ss_size); 00203 pC.resize(ss_size); 00204 pTC.resize(ss_size); 00205 pA.resize(ss_size); 00206 nA.resize(ss_size); 00207 00208 // Sense table : P(S) 00209 pS.resize(ss_size); 00210 pSbase.resize(ss_size); 00211 pSupbi.resize(ss_size); 00212 nS.resize(ss_size); 00213 // Src voc : P(E) 00214 pEbase.resize(source_wsd_voc_size); 00215 pE.resize(source_wsd_voc_size); 00216 // Context 00217 pH.resize(source_voc_size); 00218 pHbase.resize(source_voc_size); 00219 pHupbi.resize(source_voc_size); 00220 // Target vc : P(F) 00221 pF.resize(target_wsd_voc_size); 00222 00223 // Graphical model variables 00224 // target_voc - Sense table : P(F|S) 00225 nFS.resize(target_wsd_voc_size,ss_size);nFS.setName("nFS");nFS.setMode(COLUMN_WISE); 00226 pFS.resize(target_wsd_voc_size,ss_size);pFS.setName("pFS");pFS.setMode(COLUMN_WISE); 00227 00228 // source_Voc - Sense table : P(E|S) 00229 nES.resize(source_wsd_voc_size,ss_size);nES.setName("nES");nES.setMode(COLUMN_WISE); 00230 pES.resize(source_wsd_voc_size,ss_size);pES.setName("pES");pES.setMode(COLUMN_WISE); 00231 00232 00233 // Entropy computation 00234 nSE.resize(ss_size,source_wsd_voc_size);nSE.setName("nSE");nSE.setMode(COLUMN_WISE); 00235 pSE.resize(ss_size,source_wsd_voc_size);pSE.setName("pSE");pSE.setMode(COLUMN_WISE); 00236 nSEbi.resize(ss_size,source_wsd_voc_size);nSEbi.setName("nSE");nSEbi.setMode(COLUMN_WISE); 00237 pSEbi.resize(ss_size,source_wsd_voc_size);pSEbi.setName("pSE");pSEbi.setMode(COLUMN_WISE); 00238 KL.resize(source_wsd_voc_size); 00239 BiSelect.clear(); 00240 00241 // NaiveBayes model 00242 pESbase.resize(source_wsd_voc_size,ss_size);pESbase.setName("pESbase");pESbase.setMode(COLUMN_WISE); 00243 pESupbi.resize(source_wsd_voc_size,ss_size);pESupbi.setName("pESupbi");pESupbi.setMode(COLUMN_WISE); 00244 nESbase.resize(source_wsd_voc_size,ss_size);nESbase.setName("nESbase");nESbase.setMode(COLUMN_WISE); 00245 nESupbi.resize(source_wsd_voc_size,ss_size);nESupbi.setName("nESupbi");nESupbi.setMode(COLUMN_WISE); 00246 00247 // context - Sense table : P(H|S) 00248 nHS.resize(source_voc_size,ss_size);nHS.setName("nHS");nHS.setMode(COLUMN_WISE); 00249 pHS.resize(source_voc_size,ss_size);pHS.setName("pHS");pHS.setMode(COLUMN_WISE); 00250 nHSupbi.resize(source_voc_size,ss_size);nHSupbi.setName("nHSupbi");nHSupbi.setMode(COLUMN_WISE); 00251 pHSupbi.resize(source_voc_size,ss_size);pHSupbi.setName("pHSupbi");pHSupbi.setMode(COLUMN_WISE); 00252 00253 
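// Joint source/target tables: nEF accumulates counts of aligned (e,f) pairs
// read from the bitext, and pEF holds the normalized joint distribution P(E,F)
// used alongside the independent model P(E)P(F).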
pEF.resize(source_wsd_voc_size,target_wsd_voc_size);pEF.setMode(COLUMN_WISE); 00254 nEF.resize(source_wsd_voc_size,target_wsd_voc_size);nEF.setMode(COLUMN_WISE); 00255 00256 00257 init(); 00258 init_WSD(); 00259 } 00260 00261 void GraphicalBiText::loadBitext(string train_file_name, string valid_file_name,bool update_voc) 00262 { 00263 ifstream ifs1,ifs2; 00264 int if1_nb_lines=0; 00265 int if2_nb_lines=0; 00266 00267 int nb_line; 00268 string line; 00269 vector<string> tokens; 00270 ShellProgressBar progress; 00271 string src_word,src_stem_word,tgt_word; 00272 int src_word_id,tgt_word_id; 00273 int tgt_next_id = target_voc_size; 00274 // int src_next_id = source_voc_size; 00275 00276 // Train file 00277 ifs1.open(train_file_name.c_str()); 00278 if (!ifs1) PLERROR("load_bitext : can't open %s", train_file_name.c_str()); 00279 if1_nb_lines = ShellProgressBar::getAsciiFileLineCount(train_file_name); 00280 train_bitext_tgt.resize(if1_nb_lines); 00281 train_bitext_src.resize(if1_nb_lines); 00282 00283 // Valid file 00284 ifs2.open(valid_file_name.c_str()); 00285 if (!ifs2) PLERROR("load_bitext : can't open %s", valid_file_name.c_str()); 00286 if2_nb_lines = ShellProgressBar::getAsciiFileLineCount(valid_file_name); 00287 valid_bitext_tgt.resize(if2_nb_lines); 00288 valid_bitext_src.resize(if2_nb_lines); 00289 00290 // Load train 00291 progress.set(0, if1_nb_lines, "Loading "+train_file_name, 50); 00292 progress.init(); 00293 progress.draw(); 00294 nb_line = 0; 00295 while (!ifs1.eof()) { 00296 getline(ifs1, line, '\n'); 00297 if (line == "") continue; 00298 if (line[0] == '#' && line[1] == '#') continue; 00299 tokens = split(line, " "); 00300 // file must be in GIZA++ format : tgt_word src_word 00301 if (tokens.size() != 2) PLERROR("format error : file %s (line = '%s')", train_file_name.c_str(),line.c_str()); 00302 tgt_word = tostring(tokens[0]); 00303 src_word = tostring(tokens[1]); 00304 if(update_voc){ 00305 if ( target_word_to_id.find(tgt_word) == target_word_to_id.end()){ 00306 target_id_to_word[tgt_next_id]=tgt_word; 00307 target_word_to_id[tgt_word]=tgt_next_id; 00308 tgt_word_id = tgt_next_id; 00309 tgt_next_id++; 00310 }else{ 00311 tgt_word_id= target_word_to_id[tgt_word]; 00312 } 00313 if (source_word_to_id.find(src_word) == source_word_to_id.end()){ 00314 // Do not update src voc 00315 //source_id_to_word[src_next_id]=src_word; 00316 //source_word_to_id[src_word]=src_next_id; 00317 //src_word_id = src_next_id; 00318 //src_next_id++; 00319 src_word_id=source_word_to_id[OOV_TAG]; 00320 }else{ 00321 src_word_id=source_word_to_id[src_word]; 00322 } 00323 }else{ 00324 if ( target_word_to_id.find(tgt_word) == target_word_to_id.end()){ 00325 tgt_word_id=target_word_to_id[OOV_TAG]; 00326 }else{ 00327 tgt_word_id= target_word_to_id[tgt_word]; 00328 } 00329 if ( source_word_to_id.find(src_word) == source_word_to_id.end()){ 00330 src_word_id=source_word_to_id[OOV_TAG]; 00331 }else 00332 src_word_id=source_word_to_id[src_word]; 00333 } 00334 train_bitext_tgt[nb_line]=tgt_word_id; 00335 train_bitext_src[nb_line]=src_word_id; 00336 nb_line++; 00337 progress.update(nb_line); 00338 } 00339 progress.done(); 00340 if (update_voc){ 00341 target_voc_size = tgt_next_id; 00342 } 00343 00344 // do not update valid voc 00345 update_voc = false; 00346 00347 // Load valid 00348 progress.set(0, if2_nb_lines, "Loading "+valid_file_name, 50); 00349 progress.init(); 00350 progress.draw(); 00351 nb_line = 0; 00352 while (!ifs2.eof()) { 00353 getline(ifs2, line, '\n'); 00354 if (line == "") continue; 00355 if 
(line[0] == '#' && line[1] == '#') continue; 00356 tokens = split(line, " "); 00357 // file must be in GIZA++ format : tgt_word src_word 00358 if (tokens.size() != 2) PLERROR("format error : file %s (line = '%s')", valid_file_name.c_str(),line.c_str()); 00359 tgt_word = tostring(tokens[0]); 00360 src_word = tostring(tokens[1]); 00361 if(update_voc){ 00362 if ( target_word_to_id.find(tgt_word) == target_word_to_id.end()){ 00363 target_id_to_word[tgt_next_id]=tgt_word; 00364 target_word_to_id[tgt_word]=tgt_next_id; 00365 tgt_word_id = tgt_next_id; 00366 tgt_next_id++; 00367 }else{ 00368 tgt_word_id= target_word_to_id[tgt_word]; 00369 } 00370 if (source_word_to_id.find(src_word) == source_word_to_id.end()){ 00371 // Do not update src voc 00372 //source_id_to_word[src_next_id]=src_word; 00373 //source_word_to_id[src_word]=src_next_id; 00374 //src_word_id = src_next_id; 00375 //src_next_id++; 00376 src_word_id=source_word_to_id[OOV_TAG]; 00377 }else{ 00378 src_word_id=source_word_to_id[src_word]; 00379 } 00380 }else{ 00381 if ( target_word_to_id.find(tgt_word) == target_word_to_id.end()){ 00382 tgt_word_id=target_word_to_id[OOV_TAG]; 00383 }else{ 00384 tgt_word_id= target_word_to_id[tgt_word]; 00385 } 00386 if ( source_word_to_id.find(src_word) == source_word_to_id.end()){ 00387 src_word_id=source_word_to_id[OOV_TAG]; 00388 }else 00389 src_word_id=source_word_to_id[src_word]; 00390 } 00391 valid_bitext_tgt[nb_line]=tgt_word_id; 00392 valid_bitext_src[nb_line]=src_word_id; 00393 nb_line++; 00394 progress.update(nb_line); 00395 } 00396 if (update_voc){ 00397 target_voc_size = tgt_next_id; 00398 } 00399 progress.done(); 00400 00401 00402 00403 } 00404 00405 void GraphicalBiText::init_WSD() 00406 { 00407 int i,e,s,si,pos,k,h; 00408 string skey; 00409 n_fields = 6 * window_size+3; 00410 int oov_id = ontology.getWordId(OOV_TAG); 00411 Vec row_data; 00412 row_data.resize(n_fields); 00413 for (i = 0; i < wsd_train.length(); i++){ 00414 wsd_train->getRow(i, row_data); 00415 e = (int)row_data[n_fields-3]; 00416 si = (int) row_data[n_fields-2]; 00417 // map the sense 00418 s = si; 00419 skey = ontology.getSenseKey(e,si); 00420 if (si>0 && sensemap.find(skey)!=sensemap.end())s=ontology.getSynsetIDForSenseKey(e, sensemap[skey]); 00421 pos = (int)row_data[n_fields-1]; 00422 00423 // only consider supervised examples and words in the disambiguation model 00424 if (e<0 || e == oov_id)continue; 00425 if (s>0 && ontology.isWord(e)&&ontology.isSense(s)){ 00426 if (pos!=NOUN_TYPE)continue; 00427 //if (pos!=VERB_TYPE)continue; 00428 // Naive Bayes model 00429 nESbase.incr(e,s); 00430 pSbase[s]++; 00431 pEbase[e]++; 00432 00433 if(window_size!=0){ 00434 // consider the context 00435 for (k = 0; k < 2 * window_size; k++){ 00436 h = (int)row_data[3*k]; 00437 if (h<0 || h == oov_id)continue; 00438 nHS.incr(h,s); 00439 pH[h]++; 00440 pHbase[h]++; 00441 } 00442 } 00443 } 00444 } 00445 00446 // Naive Bayes model 00447 pEbase.smoothNormalize("pEbase"); 00448 pESbase.normalizeCondBackoff(nESbase,0.1,pEbase,false,false); 00449 pESupbi.normalizeCondBackoff(nESbase,0.1,pEbase,false,false); 00450 pSbase.smoothNormalize("pSbase",0.1); 00451 00452 00453 if(window_size!=0){ 00454 pH.smoothNormalize("pH"); 00455 //pHS.normalizeCond(nHS,false); 00456 pHS.normalizeCondBackoff(nHS, 0.1,pH,false,false); 00457 pHSupbi.normalizeCondBackoff(nHS,0.1,pH,false,false); 00458 //pHSupbi.normalizeCond(nHS,false); 00459 } 00460 00461 } 00462 00463 VMat loadToVMat(string file,string name, int window, int n_examples) 00464 { 00465 // open disk vmat 
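// (the DiskVMatrix is restricted by a SubVMatrix to its first n_examples rows,
// or all rows when n_examples < 0, loaded into a memory Mat, then wrapped in a
// TextSenseSequenceVMatrix with 2*window context positions)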
00466 VMat dvm = new DiskVMatrix(file); 00467 // extract a subset if wanted 00468 VMat sub_dvm = new SubVMatrix(dvm, 0, 0, (n_examples < 0 ? dvm->length() : n_examples) , dvm->width()); 00469 // load into memory mat 00470 Mat m(sub_dvm.length(), sub_dvm.width()); 00471 ShellProgressBar progress(0, m.length()-1, "Loading "+name, 50); 00472 progress.draw(); 00473 for(int i=0; i<m.length(); i++){ 00474 sub_dvm->getRow(i,m(i)); 00475 progress.update(i); 00476 } 00477 progress.done(); 00478 cout << m.length() << " lines found"<<endl; 00479 // transform int vmat 00480 VMat vm(m); 00481 VMat tvm = new TextSenseSequenceVMatrix(vm, 2*window); 00482 return tvm; 00483 } 00484 00485 00486 void GraphicalBiText::init() 00487 { 00488 00489 n_fields = 6 * window_size+3; 00490 string skey; 00491 string line; 00492 vector<string> tokens; 00493 string src_word,src_stem_word,tgt_word; 00494 int src_word_id,tgt_word_id; 00495 int c,s,si,e,f,i,j,pos; 00496 map <int,int> nb_translation; 00497 int oov_id = ontology.getWordId(OOV_TAG); 00498 int nbMap=0; 00499 Set src_senses,ss_anc; 00500 SetIterator sit; 00501 Vec row_data; 00502 row_data.resize(n_fields); 00503 ShellProgressBar progress; 00504 float maxp=0; 00505 float p; 00506 int maxs=0; 00507 pES.clear(); 00508 00509 00510 // Load SemCor 00511 wsd_train = loadToVMat (semcor_train_path,"Semcor_train",window_size,-1); 00512 //wsd_train = loadToVMat (senseval2_train_path,"Senseval_train",window_size,-1); 00513 wsd_valid = loadToVMat (semcor_valid_path,"Semcor_valid",window_size,-1); 00514 wsd_valid2 = loadToVMat (semcor_valid2_path,"Semcor_valid2",window_size,-1); 00515 wsd_test = loadToVMat (semcor_test_path,"Semcor_test",window_size,-1); 00516 //wsd_test = loadToVMat (semcor_train_path,"Semcor_train",window_size,-1); 00517 // load Senseval2 00518 senseval2_train = loadToVMat (senseval2_train_path,"Senseval_train",window_size,-1); 00519 00520 TVec < set<int> > f_possible_senses(target_wsd_voc_size); 00521 Vec symscore(ss_size); 00522 cout << "|train| = " << wsd_train.length()<< endl; 00523 00524 // Read two times : 1st to build sensemap, 2nd to initialize the model */ 00525 for (i = 0; i < wsd_train.length(); i++){ 00526 wsd_train->getRow(i, row_data); 00527 e = (int)row_data[n_fields-3]; 00528 s = (int) row_data[n_fields-2]; 00529 pos = (int)row_data[n_fields-1]; 00530 00531 // only consider supervised examples and words in the disambiguation model 00532 if (e<0 || e == oov_id)continue; 00533 if (s>0 && ontology.isWord(e)&&ontology.isSense(s)){ 00534 if (pos!=NOUN_TYPE)continue; 00535 // if (pos!=VERB_TYPE)continue; 00536 nSE.incr(s,e); 00537 pS[s]++; 00538 } 00539 } 00540 pS.normalize(); 00541 pSE.normalizeCond(nSE, false); 00542 00543 sensemap.clear(); 00544 if(sensemap_level>0)compute_nodemap(sensemap_level); 00545 //print_sensemap(); 00546 00547 pS.clear(); 00548 nSE.clear(); 00549 00550 for (i = 0; i < wsd_train.length(); i++){ 00551 wsd_train->getRow(i, row_data); 00552 e = (int)row_data[n_fields-3]; 00553 si = (int) row_data[n_fields-2]; 00554 // map the sense 00555 s = si; 00556 skey = ontology.getSenseKey(e,si); 00557 if (sensemap_level>0 && si>0 && sensemap.find(skey)!=sensemap.end()){ 00558 s=ontology.getSynsetIDForSenseKey(e, sensemap[skey]); 00559 nbMap++; 00560 } 00561 00562 pos = (int)row_data[n_fields-1]; 00563 // only consider supervised examples and words in the disambiguation model 00564 if (e<0 || e == oov_id)continue; 00565 if (s>0 && ontology.isWord(e)&&ontology.isSense(s)){ 00566 if (pos!=NOUN_TYPE)continue; 00567 // if 
(pos!=VERB_TYPE)continue; 00568 nES.incr(e, s); 00569 pS[s]++; 00570 nSE.incr(s,e); 00571 } 00572 } 00573 cout <<"INIT "<<nbMap<<" mapping done"<<endl; 00574 // Normalize 00575 00576 pES.normalizeCond(nES, false); 00577 pS.normalize(); 00578 pSE.normalizeCond(nSE, false); 00579 00580 // Compute pTC 00581 // must be after pS.normalize and before compute commNode 00582 compute_pTC(); 00583 00584 00585 //Initialize pEF and nFS 00586 // read from word train bitext file 00587 progress.set(0, train_bitext_tgt.size(), "INIT_initialize_nFS_nEF", 50); 00588 progress.init(); 00589 progress.draw(); 00590 00591 for (i=0;i< train_bitext_tgt.size();i++){ 00592 tgt_word_id=(int)train_bitext_tgt[i]; 00593 src_word_id=(int)train_bitext_src[i]; 00594 00595 // consider only words to be disambiguated 00596 if(ontology.isWord(src_word_id) && target_wsd_voc.find(tgt_word_id)!=target_wsd_voc.end()){ 00597 00598 // nEF 00599 nEF.incr(src_word_id,tgt_word_id);// update P(E,F) 00600 pE[src_word_id]++; 00601 pF[tgt_word_id]++; 00602 00603 //nFS 00604 // The set of possible senses for F is the same as its translation E 00605 // P(F|S) is uniformly distributed 00606 // WARNING : change getWordNounSense after debug 00607 src_senses = ontology.getWordNounSenses(src_word_id);// see also further NOUN 00608 //src_senses = ontology.getWordVerbSenses(src_word_id); 00609 // src_senses = ontology.getWordSenses(src_word_id); 00610 maxp=0; 00611 maxs=0; 00612 for (sit = src_senses.begin(); sit != src_senses.end(); ++sit){ 00613 real src_sense_proba = pES.get(src_word_id,*sit); 00614 00615 if (src_sense_proba!=0){ 00616 // First solution 00617 // all the senses of e are possible for (e,f) 00618 00619 // Uniform distribution 00620 nFS.set(tgt_word_id,*sit,1); 00621 // Same distribution as pES 00622 // nFS.incr(tgt_word_id,*sit,src_sense_proba); 00623 // update target word to senses map 00624 target_word_to_senses[tgt_word_id].insert(*sit); 00625 00626 00627 // Second solution : sense selection according to similarity 00628 // if(f_possible_senses[tgt_word_id].find(*sit)==f_possible_senses[tgt_word_id].end()){ 00629 // f_possible_senses[tgt_word_id].insert(*sit); 00630 //} 00631 } 00632 // Third solution : consider only most probable senses of the translation 00633 // compute most likely sense 00634 // p = pES(src_word_id,*sit)*pS[*sit]; 00635 //if(maxp<p){ 00636 // maxp = p; 00637 // maxs = *sit; 00638 //} 00639 } 00640 //f_possible_senses[tgt_word_id].insert(maxs); 00641 } 00642 progress.update(i); 00643 } 00644 progress.done(); 00645 00646 00647 00648 // Select f senses 00649 cout << "Init:attach french words"<<endl; 00650 compute_node_level(); 00651 for ( f = 0; f<target_wsd_voc_size;f++){ 00652 cout<<target_id_to_word[f]<<endl; 00653 00654 if(nEF.sumCol(f)==1){ 00655 // only one translation available for f 00656 // use most likely sense of this translation 00657 map<int, real>& col_f = nEF.getCol(f); 00658 map<int, real>::iterator it = col_f.begin(); 00659 e = it->first; 00660 maxp=0; 00661 maxs=0; 00662 for(set<int>::iterator lit1=f_possible_senses[f].begin(); lit1 != f_possible_senses[f].end(); lit1++){ 00663 s = *lit1; 00664 p = pES(e,s)*pS[s]; 00665 if(maxp<p){ 00666 maxp = p; 00667 maxs = i; 00668 } 00669 } 00670 }else{ 00671 00672 00673 symscore.clear(); 00674 for(set<int>::iterator lit1=f_possible_senses[f].begin(); lit1 != f_possible_senses[f].end(); lit1++){ 00675 i = *lit1; 00676 for(set<int>::iterator lit2=f_possible_senses[f].begin(); lit2 != f_possible_senses[f].end(); lit2++){ 00677 j = *lit2; 00678 
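// Rank sense i by the information content -log(pTC[c]) of its deepest common
// ancestor c with every other candidate sense j (akin to a Resnik-style
// similarity); senses that share specific, low-probability ancestors with
// many candidates get the highest scores and are kept below.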
if(i==j)continue; 00679 c = getDeepestCommonAncestor(i,j); 00680 symscore[i]+= -log(pTC[c]); 00681 //symscore[i]+=node_level[c]+pTC[c]; 00682 // cout <<" i="<<i<<" j="<<j<<" c="<<c<<" sym="<<-log(pTC[c])<<endl; 00683 // cout <<" i="<<i<<" j="<<j<<" c="<<c<<" sym="<<node_level[c]+pTC[c]<<endl; 00684 } 00685 } 00686 int nb_fr_sense =10; 00687 for(i=0;i<nb_fr_sense;i++){ 00688 if(symscore.size()!=0){ 00689 si = argmax(symscore); 00690 symscore[si]=0; 00691 if(si!=0){ 00692 cout <<target_id_to_word[f]<<" argmax="<<si<<" ";ontology.printSynset(si); 00693 nFS.set(f,si,1); 00694 // update target word to senses map 00695 target_word_to_senses[f].insert(si); 00696 } 00697 } 00698 } 00699 } 00700 } 00701 // Normalize 00702 pFS.normalizeCond(nFS, false); 00703 00704 00705 // Compute commNode 00706 int deepestComNode; 00707 Set e_senses; 00708 Set f_senses; 00709 progress.set(0,source_wsd_voc_size*target_wsd_voc_size , "INIT_compute_commNode", 50); 00710 progress.init(); 00711 progress.draw(); 00712 i = 0; 00713 // For each source word 00714 Set e_words=ontology.getAllWords(); 00715 for (sit = e_words.begin(); sit != e_words.end(); ++sit){ 00716 e = *sit; 00717 00718 e_senses = ontology.getWordNounSenses(e); 00719 //e_senses = ontology.getWordVerbSenses(e); 00720 00721 // For each target word 00722 for ( f = 0; f<target_wsd_voc_size;f++){ 00723 00724 f_senses = target_word_to_senses[f]; 00725 // For each sens of the current source word 00726 for(SetIterator esit=e_senses.begin(); esit!=e_senses.end();++esit){ 00727 if (pES.get(e,*esit)==0)continue; 00728 // For each sens of the current target word 00729 for (SetIterator fsit = f_senses.begin(); fsit != f_senses.end(); ++fsit){ 00730 if (pFS.get(f,*fsit)==0)continue; 00731 deepestComNode = getDeepestCommonAncestor(*esit,*fsit); 00732 //cout << "commNode "<< e << " " << *esit <<" " << f << " " << *fsit << " " <<deepestComNode <<endl; 00733 commNode(e,f).insert(deepestComNode); 00734 // nb_commNode = commNode(e,f).size();commNode(e,f)[nb_commNode]=deepestComNode; 00735 sens_to_conceptAncestors[*esit].insert(deepestComNode); 00736 sens_to_conceptAncestors[*fsit].insert(deepestComNode); 00737 if (pTC[deepestComNode]==0) PLERROR("compute_commNode : pTC[common ancestor]==0"); 00738 // Init pA 00739 pA[deepestComNode]= INIT_P_A; 00740 } 00741 } 00742 i++; 00743 progress.update(i); 00744 00745 } 00746 } 00747 progress.done(); 00748 00749 00750 00751 check_set_pA(); 00752 compute_pMC(); 00753 00754 // Joint model 00755 pEF.normalizeJoint(nEF); 00756 // Independant model 00757 pE.smoothNormalize("pE"); 00758 pF.smoothNormalize("pF"); 00759 00760 00761 check_consitency(); 00762 00763 return; 00764 00765 } 00766 int GraphicalBiText::getDeepestCommonAncestor(int s1, int s2) 00767 { 00768 list<int> candidates; 00769 int cand; 00770 Node* candNode; 00771 Node* ss2 ; 00772 SetIterator it; 00773 Set s1_ancestors; 00774 // WARNING : this interpretation of the ontology may not fit all applications 00775 // if a node is both a sense and a category 00776 // it is virtually split into two nodes : the sense node and the category node 00777 // the virtual category node is parent of the virtual sens node 00778 00779 s1_ancestors = ontology.getSynsetAncestors ( s1,-1); 00780 // if s1 is not a pure sense 00781 if (pTC[s1]!=0){ 00782 s1_ancestors.insert(s1); 00783 } 00784 // if s2 is not a pure sense 00785 if(pTC[s2]!=0){ 00786 candidates.push_back(s2); 00787 } 00788 00789 ss2 = ontology.getSynset(s2); 00790 // add s2's parents to candidate list 00791 for (it = 
ss2->parents.begin(); it != ss2->parents.end(); ++it){ 00792 candidates.push_back(*it); 00793 } 00794 // Breadth first search starting from s2 and going upward the ontology. 00795 while(!candidates.empty()){ 00796 cand = candidates.front(); 00797 candidates.pop_front(); 00798 if (s1_ancestors.find(cand)!=s1_ancestors.end()){ 00799 return cand; 00800 }else{ 00801 candNode = ontology.getSynset(cand); 00802 // add cand's parents to candidate list 00803 for (it = candNode->parents.begin(); it != candNode->parents.end(); ++it){ 00804 candidates.push_back(*it); 00805 } 00806 } 00807 } 00808 PLERROR("No common ancestor for %d and %d",s1,s2); 00809 return 0; 00810 } 00811 00812 /* Compute the probability of the subtree rooted at each node pTC */ 00813 /* pTC(c) = sum_{s\in subtree rooted c}P(s) */ 00814 void GraphicalBiText::compute_pTC() 00815 { 00816 SetIterator sit; 00817 pTC.clear(); 00818 Set ss_set=ontology.getAllCategories(); 00819 int s; 00820 // loop on all synsets which are not pure category 00821 for (sit = ss_set.begin(); sit != ss_set.end(); ++sit){ 00822 s = *sit; 00823 if (ontology.isPureCategory(s))continue; 00824 if (ontology.isPureSense(s)){ 00825 pTC[s]=0; 00826 }else{ 00827 // this synset is both a sense and a category 00828 pTC[s]+=pS[s]; 00829 } 00830 distribute_pS_on_ancestors(s,pS[s]); 00831 } 00832 } 00833 00834 00835 /* Compute the probability of the subtree rooted at each node pTC */ 00836 /* considering only the senses of a given word w */ 00837 /* pTC(c) = sum_{s of w \in subtree rooted c}P(s) */ 00838 void GraphicalBiText::compute_pTC(int word) 00839 { 00840 SetIterator sit; 00841 pTC.clear(); 00842 Set w_senses ; 00843 Set ss_set=ontology.getAllCategories(); 00844 int s; 00845 // loop on all synsets which are not pure category 00846 for (sit = ss_set.begin(); sit != ss_set.end(); ++sit){ 00847 s = *sit; 00848 if (ontology.isPureCategory(s))continue; 00849 w_senses = ontology.getWordSenses(word); 00850 if(w_senses.find(s)==w_senses.end())continue; 00851 if (ontology.isPureSense(s)){ 00852 pTC[s]=0; 00853 }else{ 00854 // this synset is both a sense and a category 00855 pTC[s]+=pS[s]; 00856 } 00857 distribute_pS_on_ancestors(s,pS[s]); 00858 } 00859 } 00860 00861 00862 00863 void GraphicalBiText::distribute_pS_on_ancestors(int s,real probaToDistribute) 00864 { 00865 real proba; 00866 Set ss_anc; 00867 SetIterator sit; 00868 ss_anc = ontology.getSynsetParents(s); 00869 // if the node has more than one parent, distribute equally the proba on each of them 00870 proba = probaToDistribute/ss_anc.size(); 00871 for ( sit = ss_anc.begin(); sit != ss_anc.end(); ++sit){ 00872 pTC[*sit]+=proba; 00873 distribute_pS_on_ancestors(*sit,proba); 00874 } 00875 } 00876 00877 00878 // void GraphicalBiText::compute_pTC_old() 00879 // { 00880 // real sum_cT; 00881 // SetIterator sit; 00882 // pTC.clear(); 00883 // Set ss_set=ontology.getAllCategories(); 00884 // // loop on all synsets 00885 // for (SetIterator ssit = ss_set.begin(); ssit != ss_set.end(); ++ssit){ 00886 // if (ontology.isPureSense(*ssit)){ 00887 // pTC[*ssit]=0; 00888 // }else { 00889 // sum_cT=0; 00890 // Set ss_desc; 00891 // Node* node = ontology.getSynset(*ssit); 00892 // ontology.extractStrictDescendants(node, ss_desc, Set()); 00893 // //ontology.extractDescendants(node, ss_desc, Set()); 00894 // // loop on all senses descendant of the current synset 00895 // for (sit = ss_desc.begin(); sit != ss_desc.end(); ++sit){ 00896 // if (ontology.isPureCategory(*sit)){ 00897 // PLERROR("compute pTC : try to sum on non sense id "); 
00898 // }else{ 00899 // // only senses have pS!=0 00900 // sum_cT+=pS[*sit]; 00901 // } 00902 // } 00903 // if (sum_cT!=0 && pS[*ssit]!=0)sum_cT+=pS[*ssit]; 00904 // pTC[*ssit]=sum_cT; 00905 // } 00906 // } 00907 // } 00908 void GraphicalBiText::compute_node_level() 00909 { 00910 list<int> desc;// descendant list 00911 SetIterator sit,ssit; 00912 Set ss_anc; 00913 Node *node; 00914 bool incomplete; 00915 int s, max_level,par; 00916 node = ontology.getSynset(ROOT_SS_ID); 00917 // add children of the root to the list 00918 for (sit = node->children.begin(); sit != node->children.end(); ++sit){ 00919 if (pTC[*sit]==0)continue; 00920 desc.push_back(*sit); 00921 //cout << " * " << *sit; 00922 } 00923 node_level[ROOT_SS_ID]=1; 00924 for(list<int>::iterator lit=desc.begin(); lit != desc.end(); lit++){ 00925 s = *lit; 00926 00927 if(pMC[s]!=0)continue; 00928 // no probability in the subtree : this part of the tree is useless 00929 if (pTC[s]==0)continue; 00930 // extract parents 00931 node = ontology.getSynset(s); 00932 ss_anc.clear(); 00933 ontology. extractAncestors(node, ss_anc, 1, 1); 00934 max_level = 0; 00935 incomplete=0; 00936 for (ssit = ss_anc.begin(); ssit != ss_anc.end(); ++ssit){ 00937 par = *ssit; 00938 if (node_level[par]==0){ PLWARNING("tried to compute level for a node (%d) and level for its parent (%d) is not computed",s,*ssit); 00939 incomplete=true; 00940 break; 00941 } 00942 if (node_level[par]>max_level)max_level = node_level[par]; 00943 } 00944 if(!incomplete){ 00945 node_level[s]=max_level+1; 00946 node = ontology.getSynset(s); 00947 // add sense children of s to the list 00948 for (sit = node->children.begin(); sit != node->children.end(); ++sit){ 00949 if (!ontology.isSynset(*sit))continue; 00950 desc.push_back(*sit); 00951 //cout << " * " << *sit; 00952 } 00953 }else{ 00954 // will try later 00955 desc.push_back(s); 00956 } 00957 } 00958 } 00959 00960 void GraphicalBiText::compute_pMC() 00961 { 00962 list<int> desc;// descendant list 00963 SetIterator sit,ssit; 00964 Set ss_anc; 00965 Node *node,*node_par; 00966 bool incomplete; 00967 int s,par; 00968 real proba_mass; 00969 real sum_pTC_par; 00970 real check_sum=0; 00971 // erase previous values 00972 pMC.clear(); 00973 pC.clear(); 00974 node = ontology.getSynset(ROOT_SS_ID); 00975 // add children of the root to the list 00976 for (sit = node->children.begin(); sit != node->children.end(); ++sit){ 00977 if (pTC[*sit]==0)continue; 00978 desc.push_back(*sit); 00979 //cout << " * " << *sit; 00980 } 00981 // Set pMC and pC for root 00982 pMC[ROOT_SS_ID]=1; 00983 pC[ROOT_SS_ID]=pMC[ROOT_SS_ID]*pA[ROOT_SS_ID]; 00984 check_sum= pC[ROOT_SS_ID]; 00985 for(list<int>::iterator lit=desc.begin(); lit != desc.end(); lit++){ 00986 incomplete = false; 00987 s = *lit; 00988 //cout << " / " << s; 00989 // pMC already computed for this node 00990 // this is possible since the ontology is not a tree but a dag 00991 if(pMC[s]!=0)continue; 00992 // no probability in the subtree : this part of the tree is useless 00993 if (pTC[s]==0)continue; 00994 // extract parents 00995 node = ontology.getSynset(s); 00996 ss_anc.clear(); 00997 ontology. 
extractAncestors(node, ss_anc, 1, 1); 00998 proba_mass = 0; 00999 for (ssit = ss_anc.begin(); ssit != ss_anc.end(); ++ssit){ 01000 par = *ssit; 01001 if (pMC[par]==0){ PLWARNING("tried to compute pMC for a node (%d) and pMC for its parent (%d) is not computed",s,*ssit); ontology.printSynset(*ssit);incomplete=true;break;} 01002 // Compute sum_{children of parent}pTC(children) 01003 sum_pTC_par=0; 01004 node_par = ontology.getSynset(par); 01005 for (sit = node_par->children.begin(); sit != node_par->children.end(); ++sit){ 01006 sum_pTC_par+=pTC[*sit]; 01007 } 01008 proba_mass+=pMC[par]*(1.0-pA[par])*pTC[s]/sum_pTC_par; 01009 //cout << "(" << *ssit << ") " <<pMC[par]*(1.0-pA[par])*pTC[s]/sum_pTC_par << " + "; 01010 } 01011 01012 if (incomplete){ 01013 // impossible to compute pMC now : will try later 01014 pMC[s]=0; 01015 desc.push_back(s); 01016 //cout << " * " << s; 01017 }else{ 01018 node = ontology.getSynset(s); 01019 // add sense children of s to the list 01020 for (sit = node->children.begin(); sit != node->children.end(); ++sit){ 01021 if (!ontology.isSynset(*sit))continue; 01022 desc.push_back(*sit); 01023 //cout << " * " << *sit; 01024 } 01025 pMC[s]=proba_mass; 01026 pC[s]=pMC[s]*pA[s]; 01027 check_sum+= pC[s]; 01028 //if(pMC[s]==0)PLERROR("pMC[%d] = 0",s); 01029 // cout <<" pMC[" << s<<"]="<< pMC[s]<<; 01030 } 01031 } 01032 } 01033 01034 bool lessPair ( pair<int,float>& p1, pair<int,float>& p2) 01035 { 01036 return p1.second < p2.second; 01037 } 01038 01039 01040 void GraphicalBiText::set_nodemap(int c,int e) 01041 /* go thru the subtree rooted inc */ 01042 /* first seen sense is the common senses */ 01043 /* for all other senses, map it to common sense */ 01044 { 01045 list <int> desc; 01046 SetIterator sit; 01047 int s; 01048 int common_sense=0; 01049 Node *node; 01050 Set e_senses = ontology.getWordSenses(e); 01051 desc.push_back(c); 01052 for(list<int>::iterator lit=desc.begin(); lit != desc.end(); lit++){ 01053 s = *lit; 01054 if (e_senses.find(s)!=e_senses.end() && pSE(s,e)!=0 ){ 01055 // the current node is both sense and category 01056 if(common_sense==0){ 01057 // first sense encountered 01058 common_sense=s; 01059 sensemap[ontology.getSenseKey(e,s)]= ontology.getSenseKey(e,s); 01060 01061 }else{ 01062 // nodemap[s]=common_sense; 01063 sensemap[ontology.getSenseKey(e,s)]= ontology.getSenseKey(e,common_sense); 01064 } 01065 cout << s<<" "<<pSE(s,e)<< " "<<ontology.getSenseKey(e,s) << " -> "<< sensemap[ontology.getSenseKey(e,s)]<<endl; 01066 } 01067 node = ontology.getSynset(s); 01068 for (sit = node->children.begin(); sit != node->children.end(); ++sit){ 01069 if (!ontology.isSynset(*sit))continue; 01070 if (pTC[*sit]==0 && (e_senses.find(*sit)==e_senses.end() || pSE(*sit,e)==0))continue; 01071 desc.push_back(*sit); 01072 } 01073 } 01074 } 01075 01076 void GraphicalBiText::print_sensemap() 01077 { 01078 int e; 01079 SetIterator sit1,sit; 01080 cout << "Print_sensemap"<<endl; 01081 /* for each source word */ 01082 Set e_words=ontology.getAllWords(); 01083 for ( sit1 = e_words.begin(); sit1 != e_words.end(); ++sit1){ 01084 e = *sit1; 01085 cout <<source_id_to_word[e]<<endl; 01086 Set e_senses = ontology.getWordSenses(e); 01087 for (sit = e_senses.begin(); sit != e_senses.end(); ++sit){ 01088 // if(nodemap.find(*sit)==nodemap.end()){ 01089 // sensemap[ontology.getSenseKey(e,*sit)]= ontology.getSenseKey(e,*sit); 01090 //}else{ 01091 //sensemap[ontology.getSenseKey(e,*sit)]= ontology.getSenseKey(e,nodemap[*sit]); 01092 //} 01093 //cout << *sit<<" "<<pS[*sit]<< " 
"<<ontology.getSenseKey(e,*sit) << " -> "<< sensemap[ontology.getSenseKey(e,*sit)]<<endl; 01094 } 01095 } 01096 } 01097 01098 void GraphicalBiText::compute_nodemap(int level) 01099 /* Waning : compute_nodemap uses pTC and erase all previous values */ 01100 /* the parameter level defined the granularity of the sense map */ 01101 /* the greater the finer */ 01102 { 01103 list<int> desc;// descendant list 01104 SetIterator sit,ssit,sit1; 01105 Set ss_anc; 01106 Set e_senses; 01107 list<pair<int,float> > split_node; 01108 Node *node; 01109 int non_null_child; 01110 float max_level; 01111 map <int,float> split_level; 01112 int s,e; 01113 cout << "Compute_nodemap"<<endl; 01114 01115 01116 /* for each source word */ 01117 Set e_words=ontology.getAllWords(); 01118 for (sit1 = e_words.begin(); sit1 != e_words.end(); ++sit1){ 01119 e = *sit1; 01120 // cout <<source_id_to_word[e]<<endl; 01121 compute_pTC(e); 01122 e_senses = ontology.getWordSenses(e); 01123 nodemap.clear(); 01124 split_level.clear(); 01125 split_node.clear(); 01126 desc.clear(); 01127 desc.push_back(ROOT_SS_ID); 01128 for(list<int>::iterator lit=desc.begin(); lit != desc.end(); lit++){ 01129 s = *lit; 01130 node = ontology.getSynset(s); 01131 01132 // if the current node is a sense, it is a split node 01133 if(e_senses.find(s)!=e_senses.end() && pSE(s,e)!=0){ 01134 non_null_child=2; 01135 01136 }else{ 01137 non_null_child=0; 01138 01139 for (sit = node->children.begin(); sit != node->children.end(); ++sit){ 01140 if (!ontology.isSynset(*sit))continue; 01141 if (pTC[*sit]==0 && (pSE(*sit,e)==0 || e_senses.find(*sit)==e_senses.end()))continue; 01142 desc.push_back(*sit); 01143 non_null_child++; 01144 } 01145 } 01146 // compute the level of the parent 01147 if(s==ROOT_SS_ID){ 01148 max_level=0; 01149 }else{ 01150 // compute split_level = max_parent(split_level(parent))+1 01151 // get parents 01152 ss_anc.clear(); 01153 max_level =0; 01154 ontology.extractAncestors(node, ss_anc, 1, 1); 01155 for (ssit = ss_anc.begin(); ssit != ss_anc.end(); ++ssit){ 01156 if (split_level[*ssit]>max_level)max_level = split_level[*ssit]; 01157 } 01158 } 01159 if(non_null_child>=2){ 01160 // the node is a split node 01161 split_level[s]=max_level+1.0; 01162 split_node.push_back(make_pair(s,max_level+1.0)); 01163 //if(e_senses.find(s)!=e_senses.end() && pS[s]!=0){ 01164 // split_node.push_back(make_pair(s,max_level+2.0)); 01165 //} 01166 }else{ 01167 // the node is not a split node 01168 split_level[s]=max_level; 01169 //if(e_senses.find(s)!=e_senses.end()){ 01170 // the current node is a sense 01171 // split_node.push_back(make_pair(s,max_level)); 01172 //} 01173 } 01174 //cout <<s<<" " <<split_level[s]<<endl; 01175 } 01176 // Initialize sensemap 01177 for (sit = e_senses.begin(); sit != e_senses.end(); ++sit){ 01178 sensemap[ontology.getSenseKey(e,*sit)]= ontology.getSenseKey(e,*sit); 01179 } 01180 for(list<pair<int,float> >::iterator lit=split_node.begin(); lit != split_node.end(); lit++){ 01181 //cout << lit->first << " " << lit->second<<endl; 01182 if(lit->second==level){ 01183 set_nodemap(lit->first,e); 01184 } 01185 } 01186 } 01187 } 01188 01189 void GraphicalBiText::check_set_pA() 01190 { 01191 // Check wether pA = 1 when it should 01192 real sum_TC; 01193 SetIterator sit,ssit; 01194 Set ss_desc; 01195 Set ss_set=ontology.getAllCategories(); 01196 Node* node,*childnode; 01197 // loop on all synsets 01198 for (sit = ss_set.begin(); sit != ss_set.end(); ++sit){ 01199 // if the node is a pure sens, continue 01200 if(pTC[*sit]==0)continue; 01201 01202 // 
compute the sum of the node's children probability 01203 sum_TC=0; 01204 node = ontology.getSynset(*sit); 01205 for (ssit = node->children.begin(); ssit != node->children.end(); ++ssit){ 01206 // loop on all the direct descendant of the current synset 01207 //sum_TC+=pTC[*ssit]; 01208 01209 01210 // this node is a both a category and a sens : it is considered as a virtual category 01211 if(pTC[*ssit]!=0 && pS[*ssit]!=0)continue; 01212 01213 childnode = ontology.getSynset(*ssit); 01214 // if a child node is shared between several parents, it contribute proportionnaly to 01215 // it parent probability 01216 01217 sum_TC+=pS[*ssit]/childnode->parents.size(); 01218 } 01219 // if the node is a both a sense and a category, add its virtual sens children 01220 if (pTC[*sit]!=0 && pS[*sit]!=0) sum_TC+=pS[*sit]; 01221 01222 01223 if (sum_TC!=0) pA[*sit]=sum_TC/pTC[*sit]; 01224 01225 // if(sum_TC==0){ 01226 // if(pA[*sit]==0)PLERROR("in check_set_pA : loosing probability mass in node %d : pA was null and forced to 1",*sit); 01227 // pA[*sit]=1; 01228 // }else{ 01229 // if(pA[*sit]==1)PLERROR("in check_set_pA : loosing probability mass in node %d : pTC!=0 but pA==1",*sit); 01230 // //pA[*sit]=sum_TC/pTC[*sit]; 01231 // } 01232 //cout << " pA["<<*sit<<"]="<< pA[*sit]<< " pMC["<<*sit<<"]="<< pMC[*sit]; 01233 01234 } 01235 } 01236 01237 void GraphicalBiText::check_consitency() 01238 { 01239 cout << "Consistency checking :"; 01240 cout << " / pS-1 : "<< sum(pS)-1.0; 01241 cout << " / pSbase-1 : "<< sum(pSbase)-1.0; 01242 cout << " / pMC : "<< sum(pMC); 01243 cout << " / pTC : "<< sum(pTC); 01244 cout << " / pA : "<< sum(pA); 01245 cout << " / pC-1 : "<< sum(pC)-1.0; 01246 cout << " / pF-1 : "<< sum(pF)-1.0; 01247 cout << " / pE-1 : "<< sum(pE)-1.0; 01248 cout << " / pH-1 : " << sum(pH)-1.0; 01249 cout << " / pHupbi-1 : " << sum(pHupbi)-1.0; 01250 cout << " / pFS : "<<pFS.checkCondProbIntegrity(); 01251 cout << " / pES : "<<pES.checkCondProbIntegrity(); 01252 cout << " / pHSupbi : "<<pHSupbi.checkCondProbIntegrity(); 01253 cout << " / pHS : "<<pHS.checkCondProbIntegrity(); 01254 cout << " / pEF-1 : "<<pEF.sumOfElements() - 1.0 <<endl; 01255 01256 } 01257 01258 void GraphicalBiText::print(string name) 01259 { 01260 real proba; 01261 real like_sum=0; 01262 real efs_sum; 01263 int e,f,k,s; 01264 TVec<int> e_senses ; 01265 SetIterator sit; 01266 int e_voc_size = ontology.getVocSize(); 01267 string filename = output_dir+"out_gra"+name; 01268 ofstream out_gra (filename.c_str()); 01269 if (!out_gra.is_open()){ PLERROR("error printing hierarchy");} 01270 01271 01272 ShellProgressBar progress(0,e_voc_size , "e_f_s_probabilities", 50); 01273 progress.init(); 01274 progress.draw(); 01275 01276 Set e_words=ontology.getAllWords(); 01277 for (sit = e_words.begin(); sit != e_words.end(); ++sit){ 01278 e = *sit; 01279 for ( f = 0; f<target_wsd_voc_size;f++){ 01280 e_senses = ontology.getSensesForWord(e); 01281 like_sum+=compute_BN_likelihood(e,f,0,1); 01282 efs_sum=0; 01283 for (k = 0; k < e_senses.size(); k++){ 01284 s = e_senses[k]; 01285 proba = compute_efs_likelihood(e,f,s); 01286 efs_sum+=proba; 01287 out_gra <<target_id_to_word[f] << "\t"<< source_id_to_word[e]<<"\t"<<proba << "\t"<< ontology.getSenseKey(e,s); 01288 // ontology.printSynset(s,out_gra); 01289 } 01290 if (efs_sum-1.0>PROB_PREC)PLERROR("print : efs doesn't sum to 1 for (%d,%d)",e,f); 01291 } 01292 progress.update(e); 01293 } 01294 progress.done(); 01295 cout << " checksum likelihood-1.0 : " <<like_sum-1.0<< endl; 01296 Set 
ss_set=ontology.getAllCategories(); 01297 //for (sit = ss_set.begin(); sit != ss_set.end(); ++sit){ 01298 // if (sum_epEC[*sit]!=0)cout <<" epEC["<<*sit<<"]="<<sum_epEC[*sit]; 01299 // if (sum_fpFC[*sit]!=0)cout <<" fpFC["<<*sit<<"]="<<sum_epEC[*sit]; 01300 // } 01301 } 01302 void GraphicalBiText::printHierarchy(string name) 01303 { 01304 string filename = "/u/kermorvc/HTML/Treebolic/hierarchy"+name+".xml"; 01305 ofstream out_hie (filename.c_str()); 01306 if (!out_hie.is_open()){ PLERROR("error printing hierarchy");} 01307 01308 out_hie <<"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE treebolic SYSTEM \"Treebolic.dtd\">\n<treebolic toolbar=\"true\" focus-on-hover=\"false\"><tree orientation=\"radial\" expansion=\"0.9\" sweep=\"1.2\" backcolor=\"fffff0\" fontface=\"true\" fontsize=\"20\" fontsizestep=\"2\">\n"; 01309 printNode(ROOT_SS_ID,out_hie); 01310 out_hie <<"<edges></edges></tree></treebolic>"; 01311 01312 } 01313 01314 void GraphicalBiText::printNode(int ss,ostream &out_hie) 01315 { 01316 SetIterator sit,ssit; 01317 Set word_set; 01318 int word; 01319 Node * node = ontology.getSynset(ss); 01320 string color; 01321 if (pTC[ss]==0){ 01322 color="cc0099"; 01323 }else if (pC[ss]==0){ 01324 color="99ffff"; 01325 }else{ 01326 color="0000ff"; 01327 } 01328 out_hie << "<node id=\""<<ss<<"\" backcolor=\""<<color<<"\" forecolor=\"ffffff\">"<<endl; 01329 out_hie <<"<label>"<<node->syns<<"</label>"<<endl; 01330 out_hie<<"<content> pC="<<pC[ss]<<" pMC="<<pMC[ss]<<" pTC=" <<pTC[ss]<<" pA="<<pA[ss]<<" pS="<< pS[ss]<<" ss="<< ss<<node->gloss<<endl; 01331 out_hie <<"</content>"; 01332 01333 // if the node is both sens and category 01334 if(pS[ss]!=0 && pTC[ss]!=0){ 01335 //create a virtual sens node 01336 out_hie << "<node id=\""<<ss<<"\" backcolor=\"ff33cc\" forecolor=\"ffffff\">"<<endl; 01337 out_hie <<"<label>"<<node->syns<<"</label>"<<endl; 01338 out_hie<<"<content> pS="<< pS[ss]<<" ss="<< ss<<node->gloss<<endl; 01339 out_hie <<"</content>"; 01340 } 01341 // Print Word Children 01342 word_set = ontology.getSynsetWordDescendants(ss); 01343 for (ssit = word_set.begin(); ssit != word_set.end(); ++ssit){ 01344 word = *ssit; 01345 if (pES.get(word,ss)!=0){ 01346 out_hie << "<node id=\"w"<<word<<"\" backcolor=\"ff9050\" forecolor=\"ffffff\">"<<endl; 01347 out_hie <<"<label> "<< source_id_to_word[word]<<"</label>"<<endl; 01348 out_hie<<"<content>"<<pES.get(word,ss)<<" id="<<word<<"</content>" <<endl; 01349 out_hie <<"</node>"<<endl; 01350 } 01351 } 01352 01353 // Print Target word children 01354 for (ssit = target_wsd_voc.begin(); ssit != target_wsd_voc.end(); ++ssit){ 01355 word = *ssit; 01356 if(pFS.get(word,ss)!=0){ 01357 01358 out_hie << "<node id=\"w"<<word<<"\" backcolor=\"00EE00\" forecolor=\"ffffff\">"<<endl; 01359 out_hie <<"<label> "<< target_id_to_word[word]<<"</label>"<<endl; 01360 out_hie<<"<content>"<<pFS.get(word,ss)<<" id="<< word <<"</content>" <<endl; 01361 out_hie <<"</node>"<<endl; 01362 } 01363 01364 } 01365 // end of virtual node 01366 if(pS[ss]!=0 && pTC[ss]!=0){out_hie <<" </node>"<<endl; } 01367 01368 for (sit = node->children.begin(); sit != node->children.end(); ++sit){ 01369 // print only branches whose proba is not null 01370 if (pTC[*sit]!=0||pS[*sit]!=0){ 01371 printNode(*sit,out_hie); 01372 } 01373 } 01374 out_hie <<" </node>"<<endl; 01375 } 01376 01377 01378 void GraphicalBiText::update_WSD_model(string name) 01379 { 01380 TVec<int> e_senses; 01381 int i,j,k,h,e,f; 01382 real proba; 01383 int nbsent=0; 01384 nHSupbi.clear(); 01385 pHSupbi.clear(); 01386 
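// Bitext update step: for each aligned pair (e,f) of the training bitext whose
// source word is in the ontology, the posterior P(s|e,f) returned by
// compute_efs_likelihood() is added as a soft count to the context table
// nHSupbi for the window_size words on each side of e (when it exceeds
// update_threshold), and the tables are renormalized with backoff at the end.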
//Initialize nHSupbi with nHS 01387 // for (int j = 0; j < nHS.getWidth(); j++){ 01388 // map<int, real>& col_j = nHS.getCol(j); 01389 // for (map<int, real>::iterator it = col_j.begin(); it != col_j.end(); ++it){ 01390 // if( it->second!=0)nHSupbi.set(it->first,j, it->second); 01391 // } 01392 //} 01393 nESupbi.clear(); 01394 //Initialize nESupbi with nESbase 01395 for (int j = 0; j < nESbase.getWidth(); j++){ 01396 map<int, real>& col_j = nESbase.getCol(j); 01397 for (map<int, real>::iterator it = col_j.begin(); it != col_j.end(); ++it){ 01398 if( it->second!=0)nESupbi.set(it->first,j, it->second); 01399 } 01400 } 01401 01402 01403 pHupbi.clear(); 01404 //pHupbi << pHbase; 01405 // "." denotes the end of the sentence 01406 int point_index = source_word_to_id[tostring(".")]; 01407 01408 string filename = output_dir+"/out_bi"+name; 01409 ofstream out_bi (filename.c_str()); 01410 if (!out_bi.is_open()){ PLERROR("error while out_bi");} 01411 ShellProgressBar progress(0, train_bitext_src.size()- 1, "Updating_WSD_model ", 50); 01412 progress.init(); 01413 progress.draw(); 01414 01415 for(i=0;i<train_bitext_src.size()-1;i++){ 01416 e = (int)train_bitext_src[i]; 01417 f = (int)train_bitext_tgt[i]; 01418 if(e==point_index){ 01419 nbsent++; 01420 continue; 01421 } 01422 // if the current word is to be disambiguated 01423 // and its translation is considered 01424 if (ontology.isWord(e)&& target_wsd_voc.find(f)!=target_wsd_voc.end()){ 01425 e_senses = ontology.getSensesForWord(e); 01426 for (k = 0; k < e_senses.size(); k++){ 01427 int s = e_senses[k]; 01428 // Compute P(s|e,f) 01429 proba = compute_efs_likelihood(e,f,s); 01430 01431 if(proba>update_threshold){ 01432 out_bi <<target_id_to_word[f] << "\t"<< source_id_to_word[e]<<"\t"<<sensemap[ontology.getSenseKey(e,s)]<<"\t"<<proba << endl; 01433 if(proba!=0){ 01434 // update context proba forward 01435 for(j=1;j<=window_size;j++){ 01436 h = (int)train_bitext_src[i+j]; 01437 if(h==point_index)break; 01438 //update context proba 01439 pHupbi[h]++; 01440 nHSupbi.incr(h,s,proba); 01441 } 01442 // update context proba backward 01443 for(j=1;j<=window_size;j++){ 01444 h = (int)train_bitext_src[i-j]; 01445 if(h==point_index)break; 01446 //update context proba 01447 pHupbi[h]++; 01448 nHSupbi.incr(h,s,proba); 01449 } 01450 } 01451 } 01452 } 01453 }else{ 01454 out_bi <<target_id_to_word[f] << "\t"<< source_id_to_word[e]<<endl; 01455 } 01456 01457 progress.update(i); 01458 } 01459 progress.done(); 01460 cout<< "Updating WSD model : "<< nbsent<<" sentences processed" <<endl; 01461 // Normalize 01462 pHupbi.smoothNormalize("pHupbi"); 01463 pHSupbi.normalizeCondBackoff(nHSupbi, 0.1,pHupbi,false,false); 01464 pESupbi.normalizeCondBackoff(nESupbi, 0.1,pEbase,false,false); 01465 } 01466 01467 void GraphicalBiText::senseTagBitext(string name) 01468 { 01469 TVec<int> e_senses; 01470 int i,k,ie,e,f; 01471 real proba=0; 01472 int sent_b,sent_e; 01473 // int nbsent=0; 01474 //SetIterator ssit; 01475 01476 // "." 
denotes the end of the sentence
    //int point_index = source_word_to_id[tostring(".")];
    sent_b = 0;
    sent_e = 0;
    i = 0;

    // open out file
    string filename = output_dir+"out_bi"+name;
    ofstream out_bi (filename.c_str());
    if (!out_bi.is_open()){ PLERROR("error while opening out_bi");}

    ShellProgressBar progress(0, train_bitext_src.size()- 1, "SenseTagBitext", 50);
    progress.init();
    progress.draw();


    for(i=0;i<train_bitext_src.size();i++){
        e = (int)train_bitext_src[i];
        f = (int)train_bitext_tgt[i];

        out_bi << target_id_to_word[f] << "\t" << source_id_to_word[e] << endl;


        if (ontology.isWord((int)train_bitext_src[i]) && target_wsd_voc.find((int)train_bitext_tgt[i])!=target_wsd_voc.end()){
            e_senses = ontology.getSensesForWord((int)train_bitext_src[i]);
            for (k = 0; k < e_senses.size(); k++){
                int s = e_senses[k];
                // Compute P(s|e,f)
                proba = compute_efs_likelihood(e,f,s);
                out_bi << target_id_to_word[f] << "\t" << source_id_to_word[e] << "\t" << proba << "\t" << ontology.getSenseKey(e,s) << "\t" << s << endl;
            }
        }
        progress.update(i);
    }
    progress.done();

}


void GraphicalBiText::sensetag_valid_bitext(string name)
{
    TVec<int> e_senses;
    int i,k,maxs,e,f;
    real proba=0,ps;

    string filename = output_dir+"out_bi"+name;
    ofstream out_bi (filename.c_str());
    if (!out_bi.is_open()){ PLERROR("error while opening out_bi");}

    ShellProgressBar progress(0, valid_bitext_src.size()- 1, "Sensetag_valid_bitext ", 50);
    progress.init();
    progress.draw();

    for (i=0;i<valid_bitext_src.size();i++){
        e = (int)valid_bitext_src[i];
        f = (int)valid_bitext_tgt[i];

        if (ontology.isWord(e) && target_wsd_voc.find(f)!=target_wsd_voc.end()){
            maxs = -1;
            ps = 0;
            e_senses = ontology.getSensesForWord(e);
            for (k = 0; k < e_senses.size(); k++){
                int s = e_senses[k];
                // Compute P(s|e,f)
                proba = compute_efs_likelihood(e,f,s);
                //out_bi <<target_id_to_word[f] << "\t"<< source_id_to_word[e]<<"\t"<<proba << "\t"<< ontology.getSenseKey(e,s);
                //ontology.printSynset(s,out_bi);
                if (proba>ps){
                    ps = proba;
                    maxs = s;
                }

            }
            out_bi << target_id_to_word[f] << "\t" << source_id_to_word[e] << "\t" << ps << endl;//"\t"<< ontology.getSenseKey(e,maxs)<<endl;
            progress.update(i);
        }
    }
    progress.done();
}


real GraphicalBiText::compute_efs_likelihood(int e,int f, int se)
{

    int s,c;
    real pws;
    real post;
    real like=0;
    Vec peC;// value of c node modified src data
    Vec pfC;// value of c node modified tgt data
    Set ss_anc;
    SetIterator sit,ssit;
    Set ss_adm; // admissible nodes
    set<int> ss_admAnc; // admissible ancestors
    Set synsets;


    peC.resize(ss_size);
    pfC.resize(ss_size);

    // Compute likelihood
    synsets=ontology.getAllCategories();
    //synsets = ontology.getWordSenses(e);
    for (sit = synsets.begin(); sit != synsets.end(); ++sit){
        s = *sit;
        pws = pES.get(e,s);
        if (pws!=0){
            ss_anc = ontology.getSynsetAncestors(s);
            // if s is not a pure sense add it to its own ancestors
            if (pTC[s]!=0){
                ss_anc.insert(s);
            }
            // loop on all ancestors
            for ( ssit = ss_anc.begin(); ssit != 
ss_anc.end(); ++ssit){// go upward following sense ancestors 01590 c = *ssit; 01591 peC[c]+=pws*pS[s]; 01592 } 01593 } 01594 } 01595 01596 synsets = target_word_to_senses[f]; 01597 for (sit = synsets.begin(); sit != synsets.end(); ++sit){ 01598 s = *sit; 01599 pws = pFS.get(f,s); 01600 if (pws!=0){ 01601 ss_anc = ontology.getSynsetAncestors(s); 01602 // if s is not a pure sense 01603 if (pTC[s]!=0){ 01604 ss_anc.insert(s); 01605 } 01606 01607 // loop on all ancesters 01608 for ( ssit = ss_anc.begin(); ssit != ss_anc.end(); ++ssit){// go upward following sense ancestors 01609 c = *ssit; 01610 pfC[c]+=pws*pS[s]; 01611 } 01612 } 01613 } 01614 ss_adm = commNode(e,f); 01615 for( ssit = ss_adm.begin();ssit != ss_adm.end();++ssit){ 01616 c = *ssit; 01617 //for(i=0;i< commNode(e,f).size();i++){ 01618 //c = commNode(e,f)[i]; 01619 if (peC[c]!=0){ 01620 if (pTC[c]==0){PLERROR("compute_BN_likelihood : division by zero leC/pTC");} 01621 peC[c]/=pTC[c]; 01622 } 01623 } 01624 for( ssit = ss_adm.begin();ssit != ss_adm.end();++ssit){ 01625 c = *ssit; 01626 //for(i=0;i< commNode(e,f).size();i++){ 01627 //c = commNode(e,f)[i]; 01628 if (pfC[c]!=0){ 01629 if (pTC[c]==0){PLERROR("compute_BN_likelihood : division by zero lfC/pTC");} 01630 pfC[c]/=pTC[c]; 01631 } 01632 } 01633 01634 for( ssit = ss_adm.begin();ssit != ss_adm.end();++ssit){ 01635 c = *ssit; 01636 //for(i=0;i< commNode(e,f).size();i++){ 01637 // c = commNode(e,f)[i]; 01638 // cout <<" esf sl "<<c<< " " << peC[c]<<" " <<pfC[c]<<" " <<pC[c]; 01639 like+=peC[c]*pfC[c]*pC[c]; 01640 } 01641 01642 01643 // Compute Posterior P(S=se|E=e,F=f) 01644 post=0; 01645 if (like!=0){ 01646 ss_anc = ontology.getSynsetAncestors(se); 01647 // if se is not a pure sense 01648 if (pTC[se]!=0){ 01649 ss_anc.insert(se); 01650 } 01651 ss_adm = commNode(e,f); 01652 set_intersection(ss_anc.begin(),ss_anc.end(),ss_adm.begin(),ss_adm.end(),inserter( ss_admAnc, ss_admAnc.begin() )); 01653 pws = pES.get(e,se); 01654 if (pws!=0){ 01655 // loop on all admissible ancestors 01656 for ( ssit = ss_admAnc.begin(); ssit != ss_admAnc.end(); ++ssit){// go upward following sense ancestors 01657 c = *ssit; 01658 //for (i=0;i< commNode(e,f).size();i++){ 01659 //c = commNode(e,f)[i]; 01660 if(ss_anc.find(c)==ss_anc.end())continue; 01661 post += pC[c]*pws*pS[se]/pTC[c]*pfC[c]/like; 01662 //cout <<" esf post "<<c<<" pC=" << pC[c]<<" pES="<<pws<<" ps="<<pS[se]<<" pTC="<<pTC[c]<<" pfC=" <<pfC[c]<<" p="<<pC[c]*pws*pS[se]/pTC[c]*pfC[c]/like<<" cum="<<post<<endl; 01663 } 01664 } 01665 } 01666 //cout <<" posterior "<<source_id_to_word[e]<<" "<<target_id_to_word[f]<<" " <<se<<" = "<<post <<" like "<<like<<endl; 01667 return post; 01668 } 01669 void GraphicalBiText::test_WSD(VMat test_set, string name, TVec<string> v,bool select, real interp) 01670 { 01671 01672 01673 int e,s,target,pos,smax,smaxb,smaxs,h; 01674 real nb_supervised=0; 01675 real nb_correct=0; 01676 real nb_single=0; 01677 real nb_unknown=0; 01678 real nb_undef=0; 01679 real nb_correctb=0; 01680 real nb_undefb=0; 01681 real nb_corrects=0; 01682 real nb_correctrandom=0; 01683 real nb_correctwn=0; 01684 real nb_undefs=0; 01685 real max,maxb,maxs,p,pupbi,ps,q,qb; 01686 int nbMap=0; 01687 01688 // Vec for detailed scores 01689 Vec dMatch( source_wsd_voc_size); 01690 Vec dMatchBi(source_wsd_voc_size); 01691 Vec dMatchStup(source_wsd_voc_size); 01692 Vec dNumber(source_wsd_voc_size); 01693 if(!select){ 01694 BiSelect.clear(); 01695 } 01696 if(select)cout <<"WSD_number_BiSelected "<<BiSelect.size()<<endl; 01697 01698 01699 Set source_words; 
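// test_WSD scores several disambiguators on the same supervised noun examples:
// the SemCor-trained naive Bayes model (pESbase with pHS context), the
// bitext-updated variant (context interpolated with pHSupbi by 'interp'), a
// context-free "stupid Bayes", the WordNet first-sense baseline and a random
// baseline. Per-word hits go to dMatch, dMatchBi and dMatchStup, and the
// summary is written to out_score_<name>.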
01700 SetIterator ssit; 01701 01702 string filename; 01703 real context_coeff; 01704 TVec<int> e_senses; 01705 int e_senses_size; 01706 int oov_id = ontology.getWordId(OOV_TAG); 01707 string skey; 01708 int i,j,k; 01709 ShellProgressBar progress; 01710 01711 string diff; 01712 int test_set_size = test_set->length(); 01713 cout << "WSD_"+name+" size = " << test_set_size << endl; 01714 01715 01716 progress.set(0, test_set_size, "Predict "+name+" senses", 50); 01717 progress.init(); 01718 progress.draw(); 01719 #ifdef PRINT_WSD 01720 filename = output_dir+"/out_wsd"+name; 01721 ofstream out_wsd (filename.c_str()); 01722 if (!out_wsd.is_open()){ PLERROR("error while opening out_wsd");} 01723 #endif 01724 Vec row_data; 01725 row_data.resize(n_fields); 01726 for (i = 0; i < test_set_size; i++){ 01727 // get data for one example from test set 01728 test_set->getRow(i, row_data); 01729 if (row_data.size() != n_fields) PLERROR("row_data[%d].size = %d, but n_fields = %d", i, row_data.size(), n_fields); 01730 e = (int)row_data[n_fields-3]; 01731 #ifdef PRINT_WSD 01732 out_wsd <<source_id_to_word[e]<<" "; 01733 #endif 01734 // consider only words in the ontology vocabulary. 01735 if (!ontology.isWord(e))continue; 01736 01737 s = (int) row_data[n_fields-2]; 01738 // map the sense 01739 skey = ontology.getSenseKey(e,s); 01740 if (sensemap_level>0 && s>0 && sensemap.find(skey)!=sensemap.end()){ 01741 nbMap++; 01742 target=ontology.getSynsetIDForSenseKey(e,sensemap[skey]); 01743 //cout << "mapping"<<s <<" "<<skey<<" " << sensemap[skey]<<" " <<ontology.getSynsetIDForSenseKey(e, sensemap[skey])<<endl; 01744 }else{ 01745 target = s; 01746 } 01747 pos = (int)row_data[n_fields-1]; 01748 if (pos!=NOUN_TYPE)continue; 01749 #ifdef PRINT_WSD 01750 out_wsd <<" tar="<<target<<" pos="<<pos<<endl; 01751 #endif 01752 01753 01754 if (target>=0){ 01755 //out_wsd <<source_id_to_word[e] <<" ts="<<target<<" "<<pos; 01756 // Reduce the number of possible senses using POS 01757 if (1){ 01758 switch (pos){ 01759 case NOUN_TYPE: 01760 e_senses = ontology.temp_word_to_noun_senses[e]; 01761 break; 01762 case VERB_TYPE: 01763 e_senses = ontology.temp_word_to_verb_senses[e]; 01764 break; 01765 case ADJ_TYPE: 01766 e_senses = ontology.temp_word_to_adj_senses[e]; 01767 break; 01768 case ADV_TYPE: 01769 e_senses = ontology.temp_word_to_adv_senses[e]; 01770 break; 01771 case UNDEFINED_TYPE: 01772 e_senses = ontology.getSensesForWord(e); 01773 break; 01774 default: 01775 PLERROR("weird in train, pos = %d", pos); 01776 } 01777 } else{ 01778 e_senses = ontology.getSensesForWord(e); 01779 } 01780 e_senses_size = e_senses.size(); 01781 if (e_senses_size==0){ 01782 // trying to disambiguate an unknown word 01783 nb_unknown ++; 01784 v[(int)nb_supervised] = "-1"; 01785 nb_supervised++; 01786 continue; 01787 } 01788 01789 01790 if (e_senses_size==1){ 01791 nb_single++; 01792 v[(int)nb_supervised] = ontology.getSenseKey(e,e_senses[0] ); 01793 dNumber[e]++; 01794 01795 nb_supervised++; 01796 continue; 01797 } 01798 01799 // Real polysemous case 01800 maxb = -FLT_MAX; 01801 max=-FLT_MAX; 01802 maxs=maxb; 01803 smax=-1; 01804 smaxb=-1; 01805 smaxs = smaxb; 01806 01807 for (j = 0; j < e_senses_size; j++){ 01808 int s = e_senses[j]; 01809 p = log(pESbase.get(e,s))+log(pSbase[s]); 01810 pupbi = p; 01811 ps =p; 01812 #ifdef PRINT_WSD 01813 out_wsd << "pES="<<pES.get(e,s)<<" pS="<<pSbase[s];//ontology.printSynset(s,out_wsd); 01814 #endif 01815 if(window_size!=0){ 01816 // Context coefficient : weight the influence of the context 01817 context_coeff = 
1.0/(2*window_size); 01818 // consider the context 01819 for (k = 0; k < 2 * window_size; k++){ 01820 h = (int)row_data[3*k]; 01821 #ifdef PRINT_WSD 01822 out_wsd <<"/"<< source_id_to_word[h]; 01823 #endif 01824 if (h==oov_id)continue; 01825 // Default naive bayes 01826 q=pHS.get(h,s); 01827 qb=pHSupbi.get(h,s); 01828 if(qb>1)PLERROR("qb>1 %f",qb); 01829 // if (q!=0 && !isnan(q)){ 01830 p += context_coeff*(log(q)); 01831 //} 01832 pupbi +=context_coeff*(interp*log(qb)+(1.0-interp)*log(q)); 01833 #ifdef PRINT_WSD 01834 out_wsd <<","<<q<<","<<qb; 01835 #endif 01836 } 01837 } 01838 #ifdef PRINT_WSD 01839 out_wsd << " s="<< s <<" p="<<p<<" pupbi="<<pupbi<<endl; 01840 #endif 01841 if (p>max){max=p;smax = s;} 01842 if (pupbi>maxb){maxb=pupbi;smaxb = s;} 01843 if (ps>maxs){maxs=ps;smaxs = s;} 01844 } 01845 //out_wsd <<endl; 01846 01847 // Naive Bayes 01848 if (max==-FLT_MAX){ 01849 nb_undef++; 01850 // No sense predicted : use first in ontology (a kind of more likely a priori) 01851 smax = e_senses[0]; 01852 } 01853 if (target==smax){ 01854 nb_correct++; 01855 dMatch[e]++; 01856 } 01857 01858 // Stupid Bayes 01859 if (maxs==-FLT_MAX){ 01860 nb_undefs++; 01861 smaxs = e_senses[0]; 01862 } 01863 if (target==smaxs){ 01864 nb_corrects++; 01865 dMatchStup[e]++; 01866 } 01867 // StupidWordNet 01868 smaxs = e_senses[0]; 01869 if (target==smaxs){ 01870 nb_correctwn++; 01871 } 01872 // Random 01873 smaxs = e_senses[(int)floor(rand()/(RAND_MAX+1.0)*(float)e_senses.size())]; 01874 01875 // smaxs = floor(bounded_uniform(0,e_senses.size())); 01876 if (target==smaxs){ 01877 nb_correctrandom++; 01878 } 01879 // Bitext 01880 if (maxb==-FLT_MAX){ 01881 nb_undefb++; 01882 // No sense predicted : use first in ontology (a kind of more likely a priori) 01883 smaxb = e_senses[0]; 01884 } 01885 // Use model selection 01886 if (select){ 01887 if(BiSelect.find(e)==BiSelect.end())smaxb = smax; 01888 01889 } 01890 if (target==smaxb){ 01891 nb_correctb++; 01892 dMatchBi[e]++; 01893 } 01894 v[(int)nb_supervised] = ontology.getSenseKey(e, smaxb); 01895 #ifdef PRINT_WSD 01896 out_wsd <<" best " <<source_id_to_word[e]<< " e=" << e <<" tar="<<target<<" hyp="<<smaxb<<" "<< ontology.getSenseKey(e, smaxb)<<endl; 01897 #endif 01898 dNumber[e]++; 01899 nb_supervised++; 01900 01901 } 01902 #ifdef PRINT_WSD 01903 out_wsd << endl; 01904 #endif 01905 progress.update(i); 01906 } 01907 progress.done(); 01908 01909 //#ifdef PRINT_WSD 01910 // open out_answers file 01911 filename = output_dir+"out_score_"+name; 01912 ofstream out_score (filename.c_str()); 01913 if (!out_score.is_open()){ PLERROR("error while opening out_score");} 01914 source_words = ontology.getAllWords(); 01915 for (ssit = source_words.begin(); ssit != source_words.end(); ++ssit){ 01916 e = *ssit; 01917 if (dNumber[e]==0)continue; 01918 if(dMatch[e]<dMatchBi[e]){diff="+";}else{diff="-";} 01919 out_score <<diff<<"\t"<<source_id_to_word[e]<<"\t"<<dNumber[e]<<"\t"<<dMatch[e]<<"\t"<<dMatchBi[e]<<"\t"<<dMatchStup[e]<<endl; 01920 if(!select && dMatch[e]<dMatchBi[e])BiSelect[e]=true; 01921 } 01922 out_score <<"#WSD "<<nbMap<<" mapping done"<<endl; 01923 out_score <<"#WSD "+name+" Random correct :"<<nb_correctrandom<<" / "<<nb_supervised<< " = " << nb_correctrandom/nb_supervised*100 <<endl; 01924 out_score <<"#WSD "+name+" StupidWordNet correct :"<<nb_correctwn<<" / "<<nb_supervised<< " = " << nb_correctwn/nb_supervised*100 <<endl; 01925 out_score <<"#WSD "+name+" StupidBayes correct :"<<nb_corrects<<" / "<<nb_supervised<< " = " << nb_corrects/nb_supervised*100 << " % - " << 
nb_undefs << " undefined" <<endl; 01926 out_score <<"#WSD "+name+" NaiveBayes correct :"<<nb_correct<<" / "<<nb_supervised<< " = " << nb_correct/nb_supervised*100 << " % - " << nb_undef << " undefined" <<endl; 01927 out_score <<"#WSD "+name+" Bitext correct :"<< nb_correctb<<" / "<<nb_supervised<< " = " << nb_correctb/nb_supervised*100 << " % - " << nb_undefb << " undefined - " <<nb_single<< " single sense words "<< nb_unknown << " unknown words " <<endl; 01928 out_score.close(); 01929 #ifdef PRINT_WSD 01930 out_wsd.close(); 01931 #endif 01932 } 01933 01934 real GraphicalBiText::compute_BN_likelihood(int e,int f, bool update, real nb) 01935 { 01936 // nb is used to update the model with nb times the same observed data 01937 int s,c,se; 01938 real p,pws; 01939 real like=0; 01940 real post,sumpost; 01941 01942 Vec peC; 01943 Vec pfC; 01944 Set ss_anc; 01945 SetIterator sit,ssit; 01946 peC.resize(ss_size); 01947 pfC.resize(ss_size); 01948 Set ss_adm;// admissible nodes 01949 set <int>ss_admAnc ; // admissible ancestors 01950 ss_adm = commNode(e,f); 01951 01952 Set synsets=ontology.getAllCategories(); 01953 // Set synsets = ontology.getWordSenses(e); 01954 for (sit = synsets.begin(); sit != synsets.end(); ++sit){ 01955 s = *sit; 01956 pws = pES.get(e,s); 01957 if (pws!=0){ 01958 ss_anc = ontology.getSynsetAncestors(s); 01959 // if s is not a pure sense add it to its own ancestors 01960 if (pTC[s]!=0){ 01961 ss_anc.insert(s); 01962 } 01963 // loop on all ancesters 01964 for ( ssit = ss_anc.begin(); ssit != ss_anc.end(); ++ssit){// go upward following sense ancestors 01965 c = *ssit; 01966 peC[c]+=pws*pS[s]; 01967 } 01968 } 01969 } 01970 synsets.clear(); 01971 ss_anc.clear(); 01972 synsets = target_word_to_senses[f]; 01973 01974 for (sit = synsets.begin(); sit != synsets.end(); ++sit){ 01975 s = *sit; 01976 pws = pFS.get(f,s); 01977 if (pws!=0){ 01978 ss_anc = ontology.getSynsetAncestors(s); 01979 // if s is not a pure sense add it to its own ancestors 01980 if (pTC[s]!=0){ 01981 ss_anc.insert(s); 01982 } 01983 // loop on all ancesters 01984 for ( ssit = ss_anc.begin(); ssit != ss_anc.end(); ++ssit){// go upward following sense ancestors 01985 c = *ssit; 01986 pfC[c]+=pws*pS[s]; 01987 } 01988 } 01989 } 01990 01991 for( ssit = ss_adm.begin();ssit != ss_adm.end();++ssit){ 01992 c = *ssit; 01993 //for(i=0;i< commNode(e,f).size();i++){ 01994 //c = commNode(e,f)[i]; 01995 if (peC[c]!=0){ 01996 if (pTC[c]==0){PLERROR("compute_BN_likelihood : division by zero leC/pTC");} 01997 peC[c]/=pTC[c]; 01998 } 01999 } 02000 02001 for( ssit = ss_adm.begin();ssit != ss_adm.end();++ssit){ 02002 c = *ssit; 02003 //for(i=0;i< commNode(e,f).size();i++){ 02004 //c = commNode(e,f)[i]; 02005 if (pfC[c]!=0){ 02006 if (pTC[c]==0){PLERROR("compute_BN_likelihood : division by zero lfC/pTC");} 02007 pfC[c]/=pTC[c]; 02008 } 02009 } 02010 02011 for( ssit = ss_adm.begin();ssit != ss_adm.end();++ssit){ 02012 c = *ssit; 02013 //for(i=0;i< commNode(e,f).size();i++){ 02014 //c = commNode(e,f)[i]; 02015 //cout <<" sl "<<c<< " " << peC[c]<<" " <<pfC[c]<<" " <<pC[c]; 02016 like+=peC[c]*pfC[c]*pC[c]; 02017 sum_epEC[c]+=peC[c]; 02018 sum_fpFC[c]+=pfC[c]; 02019 02020 } 02021 // cout <<" like("<<e<<"/"<<source_id_to_word[e]<<","<<f<<"/"<<target_id_to_word[f]<<")="<<like<<endl ; 02022 if(update){ 02023 if (like!=0){ 02024 real chk_up_pes=0; 02025 real chk_up_pfs=0; 02026 real chk_up_pc=0; 02027 real chk_up_ps=0; 02028 // Update pA 02029 02030 for( ssit = ss_adm.begin();ssit != ss_adm.end();++ssit){ 02031 c = *ssit; 02032 //for(i=0;i< 
commNode(e,f).size();i++){ 02033 //c = commNode(e,f)[i]; 02034 p= peC[c]*pfC[c]*pC[c]/like; 02035 if (p!=0)nA[c]+=nb*p*pA[c]; 02036 chk_up_pc +=nb*p*pA[c]; 02037 } 02038 if (chk_up_pc-nb>PROB_PREC)PLERROR("compute_BN_likelihood : inconsistent update for chk_pc = %f instead of %f",chk_up_pc,nb); 02039 02040 for (sit = synsets.begin(); sit != synsets.end(); ++sit){ 02041 s = *sit; 02042 ss_anc = ontology.getSynsetAncestors(s); 02043 // if s is not a pure sense add it to its own ancestors 02044 if (pTC[s]!=0){ 02045 ss_anc.insert(s); 02046 } 02047 02048 02049 ss_admAnc.clear(); 02050 set_intersection(ss_anc.begin(),ss_anc.end(),ss_adm.begin(),ss_adm.end(),inserter( ss_admAnc, ss_admAnc.begin() )); 02051 02052 // Update pES 02053 pws = pES.get(e,s); 02054 if (pws!=0){ 02055 // loop on all admissible ancestors 02056 for ( ssit = ss_admAnc.begin(); ssit != ss_admAnc.end(); ++ssit){// go upward following sense ancestors 02057 c = *ssit; 02058 //for(i=0;i< commNode(e,f).size();i++){ 02059 //c = commNode(e,f)[i]; 02060 if(ss_anc.find(c)==ss_anc.end())continue; 02061 p = pC[c]*pws*pS[s]/pTC[c]*pfC[c]/like; 02062 02063 if (p!=0){ 02064 nES.incr(e,s,nb*p); 02065 nS[s]+=nb*p; 02066 chk_up_pes+=nb*p; 02067 chk_up_ps+=nb*p; 02068 //cout <<" e ul "<<c<<" pC=" << pC[c]<<" pES="<<pws<<" ps="<<pS[s]<<" pTC="<<pTC[c]<<" pfC=" <<pfC[c]<<" p="<<pC[c]*pws*pS[s]/pTC[c]*pfC[c]/like<<" cum="<< chk_up_pes<<endl; 02069 } 02070 } 02071 } 02072 // Update pFS 02073 pws = pFS.get(f,s); 02074 if (pws!=0){ 02075 // loop on all ancestors 02076 for ( ssit = ss_admAnc.begin(); ssit != ss_admAnc.end(); ++ssit){// go upward following sense ancestors 02077 c = *ssit; 02078 //for(i=0;i< commNode(e,f).size();i++){ 02079 //c = commNode(e,f)[i]; 02080 if(ss_anc.find(c)==ss_anc.end())continue; 02081 p = pC[c]*pws*pS[s]/pTC[c]*peC[c]/like; 02082 if (p!=0){ 02083 nFS.incr(f,s,nb*p); 02084 nS[s]+=nb*p; 02085 //cout <<" f ul "<<c<<" pC=" << pC[c]<<" pFS="<<pws<<" ps="<<pS[s]<<" pTC="<<pTC[c]<<" peC=" <<pfC[c]<<" p="<<pC[c]*pws*pS[s]/pTC[c]*pfC[c]/like<<" cum="<< chk_up_pfs<<endl; 02086 chk_up_pfs+=nb*p; 02087 chk_up_ps+=nb*p; 02088 } 02089 } 02090 } 02091 } 02092 if (chk_up_pfs-nb>PROB_PREC || chk_up_pes-nb>PROB_PREC )PLERROR("compute_BN_likelihood : inconsistent update for chk_pES = %f or chk_pFS = %f instead of %f",chk_up_pes,chk_up_pfs,nb); 02093 if (chk_up_ps-2*nb>PROB_PREC)PLERROR("compute_BN_likelihood : inconsistent update for chk_ps = %f instead of %f",chk_up_ps,nb); 02094 02095 } 02096 } 02097 02098 // Compute Entropy on Bitext 02099 // Compute Posterior P(S=se|E=e,F=f) 02100 02101 sumpost=0; 02102 if (like!=0){ 02103 // For all possibles Senses 02104 Set e_senses = ontology.getWordSenses(e); 02105 for (sit = e_senses.begin(); sit != e_senses.end(); ++sit){ 02106 post=0; 02107 se = *sit; 02108 ss_anc = ontology.getSynsetAncestors(se); 02109 // if se is not a pure sense 02110 if (pTC[se]!=0){ 02111 ss_anc.insert(se); 02112 } 02113 ss_adm = commNode(e,f); 02114 ss_admAnc.clear(); 02115 set_intersection(ss_anc.begin(),ss_anc.end(),ss_adm.begin(),ss_adm.end(),inserter( ss_admAnc, ss_admAnc.begin() )); 02116 pws = pES.get(e,se); 02117 if (pws!=0){ 02118 // loop on all admissible ancestors 02119 for ( ssit = ss_admAnc.begin(); ssit != ss_admAnc.end(); ++ssit){// go upward following sense ancestors 02120 c = *ssit; 02121 //for(i=0;i< commNode(e,f).size();i++){ 02122 //c = commNode(e,f)[i]; 02123 if(ss_anc.find(c)==ss_anc.end())continue; 02124 post += pC[c]*pws*pS[se]/pTC[c]*pfC[c]/like; 02125 //cout <<" esf post "<<c<<" pC=" << 
pC[c]<<" pES="<<pws<<" ps="<<pS[se]<<" pTC="<<pTC[c]<<" pfC=" <<pfC[c]<<" p="<<pC[c]*pws*pS[se]/pTC[c]*pfC[c]/like<<" cum="<<post<<endl; 02126 } 02127 } 02128 if(post!=0){ 02129 nSEbi.incr(se,e,post); 02130 sumpost+=post; 02131 } 02132 } 02133 if (sumpost-1.0>PROB_PREC)PLERROR("Bitext Entropy computation : sum posterior %f != 1.0",sumpost); 02134 } 02135 02136 return like; 02137 } 02138 02139 void GraphicalBiText::compute_train_likelihood(string name) 02140 { 02141 compute_likelihood(train_bitext_src,train_bitext_tgt,name,1); 02142 } 02143 02144 void GraphicalBiText::compute_valid_likelihood(string name) 02145 { 02146 compute_likelihood(valid_bitext_src,valid_bitext_tgt,name,0); 02147 } 02148 02149 void GraphicalBiText::compute_likelihood( Vec bitext_src, Vec bitext_tgt,string name, bool update) 02150 { 02151 02152 real join_event_number=0; 02153 real indep_event_number=0; 02154 real bn_event_number=0; 02155 real bn_like; 02156 real indep_like; 02157 real join_like; 02158 real join_log_likelihood = 0.0; 02159 real smoothed_join_log_likelihood = 0.0; 02160 real indep_log_likelihood =0.0; 02161 real bn_log_likelihood =0.0; 02162 real smoothed_bn_log_likelihood =0.0; 02163 // update variables 02164 real sum_s,sum_es,sum_fs; 02165 real up_proba; 02166 int i; 02167 int e,f,s,c; 02168 SetIterator sit,ssit; 02169 02170 int nb_trans_pairs=0; 02171 ProbSparseMatrix ef_occur; 02172 real nb_occu; 02173 02174 02175 ef_occur.resize(source_wsd_voc_size,target_wsd_voc_size); 02176 ef_occur.setName("ef_occur");ef_occur.setMode(COLUMN_WISE); 02177 02178 //ofstream out_like ("out_like"); 02179 //if (!out_like.is_open()){ PLERROR("error while opening out_like");} 02180 02181 02182 join_log_likelihood = 0.0; 02183 indep_log_likelihood =0.0; 02184 bn_log_likelihood =0.0; 02185 join_event_number=0; 02186 indep_event_number=0; 02187 bn_event_number=0; 02188 02189 if (update){ 02190 nA.clear(); 02191 nS.clear(); 02192 nES.clear(); 02193 nFS.clear(); 02194 nSEbi.clear(); 02195 } 02196 02197 // since the likelihood depends only on (e,f), it can computed only once for each (e,f) 02198 // the updating and global likelihood depends on each (e,f) likelihood and on the frequency 02199 // of each (e,f) 02200 02201 ShellProgressBar progress(0,bitext_src.size(), "Computing_likelihood_phase1_"+name, 50); 02202 progress.init(); 02203 progress.draw(); 02204 02205 for (i=0;i<bitext_src.size() ;i++){ 02206 e = (int)bitext_src[i]; 02207 f = (int)bitext_tgt[i]; 02208 // Compute likelihod only for words in source_wsd_voc 02209 if(ontology.isWord(e) && target_wsd_voc.find(f)!=target_wsd_voc.end()){ 02210 ef_occur.incr(e,f); 02211 nb_trans_pairs++; 02212 } 02213 progress.update(i); 02214 } 02215 cout << nb_trans_pairs << " translation_pairs_found"<< endl; 02216 progress.done(); 02217 progress.set(0,ef_occur.getWidth(), "Computing_likelihood_phase2_"+name, 50); 02218 progress.init(); 02219 progress.draw(); 02220 02221 for (int f = 0; f< ef_occur.getWidth(); f++){ 02222 map<int, real>& col_j = ef_occur.getCol(f); 02223 for (map<int, real>::iterator it = col_j.begin(); it != col_j.end(); ++it){ 02224 e = (int)it->first; 02225 nb_occu = it->second; 02226 // Compute independant proba 02227 indep_like = pE[e]*pF[f]; 02228 indep_log_likelihood += nb_occu*log(indep_like); 02229 indep_event_number+= nb_occu; 02230 02231 // compute BN likelihood 02232 bn_like= compute_BN_likelihood(e,f,update,nb_occu); 02233 if (bn_like>1.0+PROB_PREC){ PLERROR("Compute_likelihood : BN proba > 1 for %d (%s) %d (%s) 
",e,(source_id_to_word[e]).c_str(),f,(target_id_to_word[f]).c_str());} 02234 if (bn_like!=0){ 02235 bn_log_likelihood += nb_occu*log(bn_like); 02236 bn_event_number+=nb_occu; 02237 } 02238 smoothed_bn_log_likelihood +=log(alpha_bn*bn_like+(1-alpha_bn)*indep_like); 02239 02240 // Compute Joint proba 02241 join_like = pEF.get(e,f); 02242 if (join_like!=0){ 02243 join_log_likelihood += nb_occu*log(join_like); 02244 join_event_number+= nb_occu; 02245 } 02246 smoothed_join_log_likelihood +=log(alpha_joint*join_like+(1-alpha_joint)*indep_like); 02247 02248 } 02249 progress.update(f); 02250 } 02251 progress.done(); 02252 02253 02254 cout << name+" indep \t/ ll = " << indep_log_likelihood << " \t/ token = " << indep_event_number << " \t/ smoothed : "<< indep_log_likelihood << " \t/ perp = " << safeexp(-indep_log_likelihood / indep_event_number) << " \t/ smoothed : " <<safeexp(-indep_log_likelihood / indep_event_number)<<endl; 02255 cout << name+" joint \t/ ll = " << join_log_likelihood << " \t/ token = " << join_event_number << " \t/ smoothed : "<< smoothed_join_log_likelihood << " \t/ perp = " << safeexp(-join_log_likelihood /join_event_number ) << " \t/ smoothed : " <<safeexp(-smoothed_join_log_likelihood /indep_event_number )<< endl; 02256 cout << name+" BN \t/ ll = " << bn_log_likelihood << " \t/ token = " << bn_event_number << " \t/ smoothed : " << smoothed_bn_log_likelihood<< " \t/ perp = " << safeexp(-bn_log_likelihood / bn_event_number) << " \t/ smoothed : " <<safeexp(-smoothed_bn_log_likelihood /indep_event_number )<<endl; 02257 02258 02259 if (update){ 02260 progress.set(0, ss_size, "Update_pS_pES_pFS", 50); 02261 progress.init(); 02262 progress.draw(); 02263 02264 // Update parameters 02265 pA.clear(); 02266 pS.clear(); 02267 pES.clear(); 02268 pFS.clear(); 02269 02270 02271 02272 // update pS 02273 sum_s = sum(nS); 02274 //cout << "sum nS :" << sum_s<<endl; 02275 Set synsets=ontology.getAllCategories(); 02276 for (sit = synsets.begin(); sit != synsets.end(); ++sit){ 02277 s = *sit; 02278 if (nS[s]!=0)pS[s]=nS[s]/sum_s;//+clear 02279 sum_es = 0; 02280 Set source_words = ontology.getAllWords(); 02281 for (ssit = source_words.begin(); ssit != source_words.end(); ++ssit){ 02282 e = *ssit; 02283 sum_es += nES.get(e,s); 02284 } 02285 for (ssit = source_words.begin(); ssit != source_words.end(); ++ssit){ 02286 e = *ssit; 02287 up_proba= nES.get(e,s); 02288 if (up_proba!=0){ 02289 pES.set(e,s,up_proba/sum_es); 02290 // cout << " ue " <<up_proba/sum_es ; 02291 } 02292 } 02293 sum_fs=0; 02294 for (ssit = target_wsd_voc.begin(); ssit != target_wsd_voc.end(); ++ssit){ 02295 f = *ssit; 02296 sum_fs += nFS.get(f,s); 02297 } 02298 for (ssit = target_wsd_voc.begin(); ssit != target_wsd_voc.end(); ++ssit){ 02299 f = *ssit; 02300 up_proba = nFS.get(f,s); 02301 if (up_proba!=0){ 02302 //cout << " uf "<<up_proba/sum_fs; 02303 pFS.set(f,s,up_proba/sum_fs); 02304 } 02305 } 02306 progress.update(s); 02307 } 02308 compute_pTC(); 02309 02310 // Update pA 02311 synsets=ontology.getAllCategories(); 02312 for (sit = synsets.begin(); sit != synsets.end(); ++sit){ 02313 c = *sit; 02314 if(nA[c]!=0){ 02315 pA[c]=nA[c]/bn_event_number; 02316 } 02317 } 02318 compute_pTC(); 02319 check_set_pA(); 02320 compute_pMC(); 02321 02322 02323 progress.done(); 02324 } 02325 pSEbi.clear(); 02326 // Entropy computation 02327 pSEbi.normalizeCond(nSEbi, false); 02328 02329 } 02330 02331 void GraphicalBiText::computeKL() 02332 { 02333 int e; 02334 SetIterator sit; 02335 Set e_words=ontology.getAllWords(); 02336 real kl,skl; 02337 for 
(sit = e_words.begin(); sit != e_words.end(); ++sit){ 02338 e = *sit; 02339 kl=0; 02340 if ( pSEbi.sumCol(e)==0 || pSE.sumCol(e)==0)continue; 02341 map<int, real>& col_e = pSE.getCol(e); 02342 //cout <<"KL\t"<<source_id_to_word[e]; 02343 for (map<int, real>::iterator mit = col_e.begin(); mit != col_e.end(); ++mit){ 02344 //cout << " e="<<e<<" s="<<mit->first<<" bi="<<pSEbi.get(mit->first,e)<<" ref="<<mit->second; 02345 skl=pSEbi.get(mit->first,e)*safeflog2(pSEbi.get(mit->first,e)/mit->second); 02346 if (!isnan(skl))kl+=skl; 02347 } 02348 //cout << "\t"<<kl<<endl; 02349 KL[e]=kl; 02350 } 02351 } 02352 02353 02354 02355 void GraphicalBiText::loadSensemap(string sensemap_file) 02356 { 02357 int nbMap=0; 02358 // Load sensemap file 02359 cout << "Loading sensemap : "; 02360 ifstream sensemap_stream(sensemap_file.c_str()); 02361 string line; 02362 vector<string> tokens; 02363 if(sensemap_stream.is_open()){ 02364 while(!sensemap_stream.eof()){ 02365 line = pgetline(sensemap_stream); 02366 if (line=="") continue; 02367 tokens = split(line, " "); 02368 if (tokens.size()>1){ 02369 nbMap++; 02370 sensemap[tokens[0]]=tokens[2]; 02371 }else{ 02372 sensemap[tokens[0]]=tokens[0]; 02373 } 02374 } 02375 } 02376 cout << nbMap << " sense mappings found\n"; 02377 // for(map<string,string>::iterator mit=sensemap.begin();mit!=sensemap.end();mit++)cout << mit->first << " -> "<<mit->second<<endl; 02378 } 02379 02380 void GraphicalBiText::train(VMat training_set) 02381 { 02382 02383 02384 TVec<string> our_answers1(wsd_train.length()); 02385 02386 real interp_max = 1; 02387 real interp_min = 1; 02388 real interp_step = 0.4; 02389 // Bi.print("0"); 02390 //printHierarchy("0"); 02391 //Bi.computeKL(); 02392 for(real interp=interp_min;interp<=interp_max;interp+=interp_step){ 02393 test_WSD(wsd_train, "Semcor_train_set_epoch_0_"+tostring(interp), our_answers1,0,interp); 02394 test_WSD(wsd_valid, "Semcor_valid1_set_epoch_0_"+tostring(interp), our_answers1,0,interp); 02395 test_WSD(wsd_valid2,"Semcor_valid2_set_epoch_0_"+tostring(interp),our_answers1,0,interp); 02396 test_WSD(wsd_test,"Semcor_test_set_epoch_0_"+tostring(interp),our_answers1,0,interp); 02397 // test_WSD(senseval2_train,"Senseval2_trainset_epoch_0_"+tostring(interp),our_answers1,0,interp); 02398 } 02399 // Bi.test_WSD(senseval_test, "Senseval_test_set_epoch_0", our_answers,1); 02400 //string out_name = "out_answer_0"; ofstream out_answer (out_name.c_str()); if (!out_answer.is_open()){ PLERROR("error while opening out_answer");}int k=0;for(int i=0; i<our_answers.size(); i++){ string::size_type pos = headers[i].find_first_of("."); out_answer << headers[i].substr(0,pos) << " " << headers[i] << " " << our_answers[k] << endl; k++; }out_answer.close(); 02401 02402 02403 for (int i=1;i<n_epoch;i++){ 02404 compute_train_likelihood("Train_set_epoc "+tostring(i)); 02405 // Bi.computeKL(); 02406 compute_valid_likelihood("Valid_set_epoc "+tostring(i)); 02407 //optimize_interp_parameter(test_tgt,test_src, "Opt valid"); 02408 02409 update_WSD_model(tostring(i)); 02410 check_consitency(); 02411 //Bi.print(tostring(i)); 02412 // printHierarchy(tostring(i)); 02413 for(real interp=interp_min;interp<=interp_max;interp+=interp_step){ 02414 test_WSD(wsd_train, "Semcor_train_set_epoch_"+tostring(i)+"_"+tostring(interp), our_answers1,0,interp); 02415 test_WSD(wsd_valid, "Semcor_valid1_set_epoch_"+tostring(i)+"_"+tostring(interp), our_answers1,0,interp); 02416 test_WSD(wsd_valid2, "Semcor_valid2_set_epoch_"+tostring(i)+"_"+tostring(interp), our_answers1,0,interp); 02417 
test_WSD(wsd_test,"Semcor_test_set_epoch_"+tostring(i)+"_"+tostring(interp),our_answers1,0,interp); 02418 // test_WSD(senseval2_train,"Senseval2_train_set_epoch_"+tostring(i)+"_"+tostring(interp),our_answers1,0,interp); 02419 } 02420 02421 // Bi.test_WSD(senseval_test, "Senseval_test_set_epoch_"+tostring(i), 02422 // our_answers,1);out_name = "out_answer_"+tostring(i);ofstream out_answer (out_name.c_str());if (!out_answer.is_open()){ PLERROR("error while opening out_answer");}int k=0;for(int j=0; j<our_answers.size(); j++){string::size_type pos = headers[j].find_first_of(".");out_answer << headers[j].substr(0,pos) << " " << headers[j] << " " << our_answers[k] << endl;k++;}out_answer.close(); 02423 } 02424 02425 } 02426 02427 void GraphicalBiText::test() 02428 { 02429 02430 } 02431 02432 void GraphicalBiText::setTrainingSet(VMat training_set, bool call_forget) 02433 { 02434 02435 } 02436 02437 02438 } // end of namespace PLearn 02439
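The quantity accumulated by compute_efs_likelihood and compute_BN_likelihood above can be summarized as follows. This is a hedged reconstruction read off the listing, not part of the PLearn source; it uses the listing's own names (pC, pES, pFS, pS, pTC), writes adm(e,f) for the admissible common nodes returned by commNode(e,f), and anc(s) for the synset ancestors of s (including s itself when pTC[s] != 0, i.e. when s is not a pure sense):

like(e,f) = \sum_{c \in adm(e,f)} p_C(c) \; \frac{\sum_{s : c \in anc(s)} p_{ES}(e,s)\, p_S(s)}{p_{TC}(c)} \; \frac{\sum_{s' : c \in anc(s')} p_{FS}(f,s')\, p_S(s')}{p_{TC}(c)}

P(S = s_e \mid E = e, F = f) = \frac{1}{like(e,f)} \sum_{c \in adm(e,f) \cap anc(s_e)} p_C(c) \, \frac{p_{ES}(e,s_e)\, p_S(s_e)}{p_{TC}(c)} \, \frac{\sum_{s' : c \in anc(s')} p_{FS}(f,s')\, p_S(s')}{p_{TC}(c)}

The second expression is the posterior that sensetag_train_bitext and sensetag_valid_bitext write to their output files, and the per-ancestor terms of the first are what compute_BN_likelihood redistributes as expected counts (nA, nES, nFS, nS) during the EM update performed by compute_likelihood.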
