#include "GraphicalBiText.h"

namespace PLearn {
using namespace std;

PLEARN_IMPLEMENT_OBJECT(GraphicalBiText,
                        "Probabilistically tag a bitext (English-other language) with senses from WordNet",
                        "NO HELP");

GraphicalBiText::GraphicalBiText()
    : window_size(3),
      n_epoch(5),
      source_path("/u/larocheh/myUserExp/WSD/features/world3"),
      semcor_train_path("/u/kermorvc/Data/Semcor/semcor1.7/1/train_corpus_all_wn17"),
      semcor_valid_path("/u/kermorvc/Data/Semcor/semcor1.7/1/valid1_corpus_all_wn17"),
      semcor_valid2_path("/u/kermorvc/Data/Semcor/semcor1.7/1/valid2_corpus_all_wn17"),
      semcor_test_path("/u/kermorvc/Data/Semcor/semcor1.7/1/test_corpus_all_wn17"),
      senseval2_train_path("/u/kermorvc/Data/Senseval/english-lex-sample/train/eng-lex_world3"),
      update_threshold(0),
      output_dir("./")
{
}

GraphicalBiText::~GraphicalBiText()
{
}

void GraphicalBiText::declareOptions(OptionList& ol)
{
    declareOption(ol, "window_size", &GraphicalBiText::window_size, OptionBase::buildoption,
                  "size of the context window for disambiguation (same on the right side and on the left side)\n");
    declareOption(ol, "n_epoch", &GraphicalBiText::n_epoch, OptionBase::buildoption,
                  "number of iterations of the EM learning algorithm\n");
    declareOption(ol, "source_path", &GraphicalBiText::source_path, OptionBase::buildoption,
                  "path to the ontology\n");
    declareOption(ol, "source_voc", &GraphicalBiText::source_voc, OptionBase::buildoption,
                  "path to the source language vocabulary\n");
    declareOption(ol, "target_voc", &GraphicalBiText::target_voc, OptionBase::buildoption,
                  "path to the target language vocabulary\n");
    declareOption(ol, "train_file", &GraphicalBiText::train_file, OptionBase::buildoption,
                  "path to the bitext training file\n");
    declareOption(ol, "valid_file", &GraphicalBiText::valid_file, OptionBase::buildoption,
                  "path to the bitext validation file\n");
    declareOption(ol, "sensemap_file", &GraphicalBiText::sensemap_file, OptionBase::buildoption,
                  "path to the sensemap file for coarse senses\n");
    declareOption(ol, "sensemap_level", &GraphicalBiText::sensemap_level, OptionBase::buildoption,
                  "level of sense grouping: 1 = all grouped, 99 = all separated\n");
    declareOption(ol, "semcor_train_path", &GraphicalBiText::semcor_train_path, OptionBase::buildoption,
                  "path to the semcor training VMat file\n");
    declareOption(ol, "semcor_valid_path", &GraphicalBiText::semcor_valid_path, OptionBase::buildoption,
                  "path to the semcor validation VMat file\n");
    declareOption(ol, "semcor_valid2_path", &GraphicalBiText::semcor_valid2_path, OptionBase::buildoption,
                  "path to a second semcor validation VMat file\n");
    declareOption(ol, "semcor_test_path", &GraphicalBiText::semcor_test_path, OptionBase::buildoption,
                  "path to the semcor testing VMat file\n");
    declareOption(ol, "update_threshold", &GraphicalBiText::update_threshold, OptionBase::buildoption,
                  "p(s|e,f) threshold above which the bitext data is used\n");
    declareOption(ol, "output_dir", &GraphicalBiText::output_dir, OptionBase::buildoption,
                  "dir for all outputs\n");
}

void GraphicalBiText::build()
{
    inherited::build();
    build_();
}

void GraphicalBiText::build_()
{
    string line;
    vector<string> tokens;
    string word;
    int id;
    SetIterator sit;

    alpha_bn = INIT_ALPHA;
    alpha_joint = INIT_ALPHA;

    string wsd_voc = source_path + ".voc";
    string synset_file = source_path + ".synsets";
    string ontology_file = source_path + ".ontology";
    string sense_key_file = source_path + ".sense_key";

    ontology = WordNetOntology(wsd_voc, synset_file, ontology_file, sense_key_file, false, false);

    ontology.fillTempWordToSensesTVecMap();

    source_wsd_voc_size = ontology.getVocSize();
    source_voc_size = ontology.getVocSize();
    sense_size = ontology.getSenseSize();
    ss_size = ontology.getMaxSynsetId() + 1;
    cout << "|original ontology voc| = " << source_wsd_voc_size << endl;
    cout << "|original sense| = " << sense_size << endl;
    cout << "|original synset| = " << ss_size << endl;

    // Build the source word <-> id maps from the ontology vocabulary.
    Set all_words = ontology.getAllWords();
    for (sit = all_words.begin(); sit != all_words.end(); ++sit) {
        word = ontology.getWord(*sit);
        id = ontology.getWordId(word);
        source_word_to_id[word] = id;
        source_id_to_word[id] = word;
    }
    cout << "| source voc | = " << source_word_to_id.size() << endl;

    // Load the source vocabulary and prune the ontology down to those words.
    ifstream if_voc(source_voc.c_str());
    if (!if_voc)
        PLERROR("can't open %s", source_voc.c_str());
    Set words_to_be_kept;
    int oov_id = ontology.getWordId(OOV_TAG);
    int wn_id;
    while (!if_voc.eof()) {
        getline(if_voc, line, '\n');
        if (line == "") continue;
        if (line[0] == '#' && line[1] == '#') continue;
        tokens = split(line, " ");
        if (tokens.size() != 1)
            PLERROR("source vocabulary file format error (line = '%s')", line.c_str());
        wn_id = ontology.getWordId(tostring(tokens[0]));
        if (wn_id == oov_id && tostring(tokens[0]) != OOV_TAG) {
            PLWARNING("word to disambiguate is not in the ontology %s", line.c_str());
        } else {
            words_to_be_kept.insert(wn_id);
        }
    }
    if_voc.close();

    for (sit = all_words.begin(); sit != all_words.end(); ++sit) {
        if (words_to_be_kept.find(*sit) == words_to_be_kept.end()) {
            ontology.removeWord(*sit);
        }
    }
    ontology.removeNonReachableSynsets();

    cout << "|pruned ontology voc| = " << ontology.getVocSize() << endl;
    cout << "|pruned sense| = " << ontology.getSenseSize() << endl;
    cout << "|pruned synset| = " << ontology.getMaxSynsetId() + 1 << endl;

    // Load the target vocabulary.
    ifstream if_tvoc(target_voc.c_str());
    if (!if_tvoc)
        PLERROR("can't open %s", target_voc.c_str());
    int next_id = 0;
    while (!if_tvoc.eof()) {
        getline(if_tvoc, line, '\n');
        if (line == "") continue;
        if (line[0] == '#' && line[1] == '#') continue;
        tokens = split(line, " ");
        if (tokens.size() != 1)
            PLERROR("target vocabulary file format error (line = '%s')", line.c_str());
        target_id_to_word[next_id] = tostring(tokens[0]);
        target_word_to_id[tostring(tokens[0])] = next_id;
        target_wsd_voc.insert(next_id);
        next_id++;
    }
    if_tvoc.close();

    if (target_word_to_id.find(OOV_TAG) == target_word_to_id.end()) {
        target_word_to_id[OOV_TAG] = next_id;
        cout << " add OOV to target vocabulary " << endl;
        next_id++;
    }
    target_wsd_voc_size = target_wsd_voc.size();
    target_voc_size = target_wsd_voc_size;
    cout << "|WSD target voc| = " << target_wsd_voc_size << endl;

    loadBitext(train_file, valid_file, 0);
    cout << "|target voc| = " << target_voc_size << endl;
    cout << "|source voc| = " << source_voc_size << endl;

    commNode.resize(source_wsd_voc_size, target_wsd_voc_size);

    sum_epEC.resize(ss_size);
    sum_fpFC.resize(ss_size);
    sum_cpC.resize(ss_size);

    pMC.resize(ss_size);
    pC.resize(ss_size);
    pTC.resize(ss_size);
    pA.resize(ss_size);
    nA.resize(ss_size);

    pS.resize(ss_size);
    pSbase.resize(ss_size);
    pSupbi.resize(ss_size);
    nS.resize(ss_size);

    pEbase.resize(source_wsd_voc_size);
    pE.resize(source_wsd_voc_size);

    pH.resize(source_voc_size);
    pHbase.resize(source_voc_size);
    pHupbi.resize(source_voc_size);

    pF.resize(target_wsd_voc_size);

    nFS.resize(target_wsd_voc_size, ss_size); nFS.setName("nFS"); nFS.setMode(COLUMN_WISE);
    pFS.resize(target_wsd_voc_size, ss_size); pFS.setName("pFS"); pFS.setMode(COLUMN_WISE);

    nES.resize(source_wsd_voc_size, ss_size); nES.setName("nES"); nES.setMode(COLUMN_WISE);
    pES.resize(source_wsd_voc_size, ss_size); pES.setName("pES"); pES.setMode(COLUMN_WISE);

    nSE.resize(ss_size, source_wsd_voc_size); nSE.setName("nSE"); nSE.setMode(COLUMN_WISE);
    pSE.resize(ss_size, source_wsd_voc_size); pSE.setName("pSE"); pSE.setMode(COLUMN_WISE);
    nSEbi.resize(ss_size, source_wsd_voc_size); nSEbi.setName("nSEbi"); nSEbi.setMode(COLUMN_WISE);
    pSEbi.resize(ss_size, source_wsd_voc_size); pSEbi.setName("pSEbi"); pSEbi.setMode(COLUMN_WISE);
    KL.resize(source_wsd_voc_size);
    BiSelect.clear();

    pESbase.resize(source_wsd_voc_size, ss_size); pESbase.setName("pESbase"); pESbase.setMode(COLUMN_WISE);
    pESupbi.resize(source_wsd_voc_size, ss_size); pESupbi.setName("pESupbi"); pESupbi.setMode(COLUMN_WISE);
    nESbase.resize(source_wsd_voc_size, ss_size); nESbase.setName("nESbase"); nESbase.setMode(COLUMN_WISE);
    nESupbi.resize(source_wsd_voc_size, ss_size); nESupbi.setName("nESupbi"); nESupbi.setMode(COLUMN_WISE);

    nHS.resize(source_voc_size, ss_size); nHS.setName("nHS"); nHS.setMode(COLUMN_WISE);
    pHS.resize(source_voc_size, ss_size); pHS.setName("pHS"); pHS.setMode(COLUMN_WISE);
    nHSupbi.resize(source_voc_size, ss_size); nHSupbi.setName("nHSupbi"); nHSupbi.setMode(COLUMN_WISE);
    pHSupbi.resize(source_voc_size, ss_size); pHSupbi.setName("pHSupbi"); pHSupbi.setMode(COLUMN_WISE);

    pEF.resize(source_wsd_voc_size, target_wsd_voc_size); pEF.setMode(COLUMN_WISE);
    nEF.resize(source_wsd_voc_size, target_wsd_voc_size); nEF.setMode(COLUMN_WISE);

    init();
    init_WSD();
}

void GraphicalBiText::loadBitext(string train_file_name, string valid_file_name, bool update_voc)
{
    ifstream ifs1, ifs2;
    int if1_nb_lines = 0;
    int if2_nb_lines = 0;
    int nb_line;
    string line;
    vector<string> tokens;
    ShellProgressBar progress;
    string src_word, src_stem_word, tgt_word;
    int src_word_id, tgt_word_id;
    int tgt_next_id = target_voc_size;

    ifs1.open(train_file_name.c_str());
    if (!ifs1)
        PLERROR("load_bitext : can't open %s", train_file_name.c_str());
    if1_nb_lines = ShellProgressBar::getAsciiFileLineCount(train_file_name);
    train_bitext_tgt.resize(if1_nb_lines);
    train_bitext_src.resize(if1_nb_lines);

    ifs2.open(valid_file_name.c_str());
    if (!ifs2)
        PLERROR("load_bitext : can't open %s", valid_file_name.c_str());
    if2_nb_lines = ShellProgressBar::getAsciiFileLineCount(valid_file_name);
    valid_bitext_tgt.resize(if2_nb_lines);
    valid_bitext_src.resize(if2_nb_lines);

    progress.set(0, if1_nb_lines, "Loading " + train_file_name, 50);
    progress.init();
    progress.draw();
    nb_line = 0;
    while (!ifs1.eof()) {
        getline(ifs1, line, '\n');
        if (line == "") continue;
        if (line[0] == '#' && line[1] == '#') continue;
        tokens = split(line, " ");
        if (tokens.size() != 2)
            PLERROR("format error : file %s (line = '%s')", train_file_name.c_str(), line.c_str());
        tgt_word = tostring(tokens[0]);
        src_word = tostring(tokens[1]);
        if (update_voc) {
            if (target_word_to_id.find(tgt_word) == target_word_to_id.end()) {
                target_id_to_word[tgt_next_id] = tgt_word;
                target_word_to_id[tgt_word] = tgt_next_id;
                tgt_word_id = tgt_next_id;
                tgt_next_id++;
            } else {
                tgt_word_id = target_word_to_id[tgt_word];
            }
            if (source_word_to_id.find(src_word) == source_word_to_id.end()) {
                src_word_id = source_word_to_id[OOV_TAG];
            } else {
                src_word_id = source_word_to_id[src_word];
            }
        } else {
            if (target_word_to_id.find(tgt_word) == target_word_to_id.end()) {
                tgt_word_id = target_word_to_id[OOV_TAG];
            } else {
                tgt_word_id = target_word_to_id[tgt_word];
            }
            if (source_word_to_id.find(src_word) == source_word_to_id.end()) {
                src_word_id = source_word_to_id[OOV_TAG];
            } else {
                src_word_id = source_word_to_id[src_word];
            }
        }
        train_bitext_tgt[nb_line] = tgt_word_id;
        train_bitext_src[nb_line] = src_word_id;
        nb_line++;
        progress.update(nb_line);
    }
    progress.done();
    if (update_voc) {
        target_voc_size = tgt_next_id;
    }

    // The validation corpus never extends the vocabulary.
    update_voc = false;

    progress.set(0, if2_nb_lines, "Loading " + valid_file_name, 50);
    progress.init();
    progress.draw();
    nb_line = 0;
    while (!ifs2.eof()) {
        getline(ifs2, line, '\n');
        if (line == "") continue;
        if (line[0] == '#' && line[1] == '#') continue;
        tokens = split(line, " ");
        if (tokens.size() != 2)
            PLERROR("format error : file %s (line = '%s')", valid_file_name.c_str(), line.c_str());
        tgt_word = tostring(tokens[0]);
        src_word = tostring(tokens[1]);
        if (update_voc) {
            if (target_word_to_id.find(tgt_word) == target_word_to_id.end()) {
                target_id_to_word[tgt_next_id] = tgt_word;
                target_word_to_id[tgt_word] = tgt_next_id;
                tgt_word_id = tgt_next_id;
                tgt_next_id++;
            } else {
                tgt_word_id = target_word_to_id[tgt_word];
            }
            if (source_word_to_id.find(src_word) == source_word_to_id.end()) {
                src_word_id = source_word_to_id[OOV_TAG];
            } else {
                src_word_id = source_word_to_id[src_word];
            }
        } else {
            if (target_word_to_id.find(tgt_word) == target_word_to_id.end()) {
                tgt_word_id = target_word_to_id[OOV_TAG];
            } else {
                tgt_word_id = target_word_to_id[tgt_word];
            }
            if (source_word_to_id.find(src_word) == source_word_to_id.end()) {
                src_word_id = source_word_to_id[OOV_TAG];
            } else {
                src_word_id = source_word_to_id[src_word];
            }
        }
        valid_bitext_tgt[nb_line] = tgt_word_id;
        valid_bitext_src[nb_line] = src_word_id;
        nb_line++;
        progress.update(nb_line);
    }
    if (update_voc) {
        target_voc_size = tgt_next_id;
    }
    progress.done();
}
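
// init_WSD: estimate the supervised (Semcor-only) side of the model.  For
// each noun occurrence (e, s) in wsd_train it accumulates the counts
// nESbase(e,s), pSbase[s] and pEbase[e], and, for every context word h in
// the +/- window_size window, nHS(h,s), pH[h] and pHbase[h].  The counts
// are then turned into smoothed conditional distributions with
// normalizeCondBackoff, which presumably backs off towards the
// unconditional word distribution passed as third argument.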
void GraphicalBiText::init_WSD()
{
    int i, e, s, si, pos, k, h;
    string skey;
    n_fields = 6 * window_size + 3;
    int oov_id = ontology.getWordId(OOV_TAG);
    Vec row_data;
    row_data.resize(n_fields);
    for (i = 0; i < wsd_train.length(); i++) {
        wsd_train->getRow(i, row_data);
        e = (int)row_data[n_fields - 3];
        si = (int)row_data[n_fields - 2];
        // Map the sense through the coarse-sense grouping if one is defined.
        s = si;
        skey = ontology.getSenseKey(e, si);
        if (si > 0 && sensemap.find(skey) != sensemap.end())
            s = ontology.getSynsetIDForSenseKey(e, sensemap[skey]);
        pos = (int)row_data[n_fields - 1];

        if (e < 0 || e == oov_id) continue;
        if (s > 0 && ontology.isWord(e) && ontology.isSense(s)) {
            if (pos != NOUN_TYPE) continue;
            nESbase.incr(e, s);
            pSbase[s]++;
            pEbase[e]++;

            if (window_size != 0) {
                for (k = 0; k < 2 * window_size; k++) {
                    h = (int)row_data[3 * k];
                    if (h < 0 || h == oov_id) continue;
                    nHS.incr(h, s);
                    pH[h]++;
                    pHbase[h]++;
                }
            }
        }
    }

    pEbase.smoothNormalize("pEbase");
    pESbase.normalizeCondBackoff(nESbase, 0.1, pEbase, false, false);
    pESupbi.normalizeCondBackoff(nESbase, 0.1, pEbase, false, false);
    pSbase.smoothNormalize("pSbase", 0.1);

    if (window_size != 0) {
        pH.smoothNormalize("pH");
        pHS.normalizeCondBackoff(nHS, 0.1, pH, false, false);
        pHSupbi.normalizeCondBackoff(nHS, 0.1, pH, false, false);
    }
}
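
// loadToVMat: helper that materializes a DiskVMatrix into memory and wraps
// it in a TextSenseSequenceVMatrix exposing 2*window context positions.
// Passing n_examples < 0 loads the whole file.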
VMat loadToVMat(string file, string name, int window, int n_examples)
{
    VMat dvm = new DiskVMatrix(file);
    VMat sub_dvm = new SubVMatrix(dvm, 0, 0, (n_examples < 0 ? dvm->length() : n_examples), dvm->width());

    Mat m(sub_dvm.length(), sub_dvm.width());
    ShellProgressBar progress(0, m.length() - 1, "Loading " + name, 50);
    progress.draw();
    for (int i = 0; i < m.length(); i++) {
        sub_dvm->getRow(i, m(i));
        progress.update(i);
    }
    progress.done();
    cout << m.length() << " lines found" << endl;

    VMat vm(m);
    VMat tvm = new TextSenseSequenceVMatrix(vm, 2 * window);
    return tvm;
}
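
// init: full initialization pipeline, roughly:
//   1. load the Semcor/Senseval corpora;
//   2. estimate pS and pSE from the sense-tagged data (twice: once raw, once
//      after the sense grouping defined by compute_nodemap/sensemap);
//   3. propagate sense mass up the ontology (compute_pTC);
//   4. collect co-occurrence counts nEF from the bitext and attach target
//      (French) words to candidate senses (nFS, target_word_to_senses);
//   5. record the deepest common ancestors (commNode) of every admissible
//      (source sense, target sense) pair and initialize pA, pMC, pC;
//   6. normalize everything and run a consistency check.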
void GraphicalBiText::init()
{
    n_fields = 6 * window_size + 3;
    string skey;
    string line;
    vector<string> tokens;
    string src_word, src_stem_word, tgt_word;
    int src_word_id, tgt_word_id;
    int c, s, si, e, f, i, j, pos;
    map<int, int> nb_translation;
    int oov_id = ontology.getWordId(OOV_TAG);
    int nbMap = 0;
    Set src_senses, ss_anc;
    SetIterator sit;
    Vec row_data;
    row_data.resize(n_fields);
    ShellProgressBar progress;
    float maxp = 0;
    float p;
    int maxs = 0;
    pES.clear();

    wsd_train = loadToVMat(semcor_train_path, "Semcor_train", window_size, -1);
    wsd_valid = loadToVMat(semcor_valid_path, "Semcor_valid", window_size, -1);
    wsd_valid2 = loadToVMat(semcor_valid2_path, "Semcor_valid2", window_size, -1);
    wsd_test = loadToVMat(semcor_test_path, "Semcor_test", window_size, -1);
    senseval2_train = loadToVMat(senseval2_train_path, "Senseval_train", window_size, -1);

    TVec< set<int> > f_possible_senses(target_wsd_voc_size);
    Vec symscore(ss_size);
    cout << "|train| = " << wsd_train.length() << endl;

    // First pass: estimate pS and pSE from the raw sense-tagged data.
    for (i = 0; i < wsd_train.length(); i++) {
        wsd_train->getRow(i, row_data);
        e = (int)row_data[n_fields - 3];
        s = (int)row_data[n_fields - 2];
        pos = (int)row_data[n_fields - 1];
        if (e < 0 || e == oov_id) continue;
        if (s > 0 && ontology.isWord(e) && ontology.isSense(s)) {
            if (pos != NOUN_TYPE) continue;
            nSE.incr(s, e);
            pS[s]++;
        }
    }
    pS.normalize();
    pSE.normalizeCond(nSE, false);

    sensemap.clear();
    if (sensemap_level > 0) compute_nodemap(sensemap_level);

    // Second pass: re-estimate with senses mapped through the sensemap.
    pS.clear();
    nSE.clear();
    for (i = 0; i < wsd_train.length(); i++) {
        wsd_train->getRow(i, row_data);
        e = (int)row_data[n_fields - 3];
        si = (int)row_data[n_fields - 2];
        s = si;
        skey = ontology.getSenseKey(e, si);
        if (sensemap_level > 0 && si > 0 && sensemap.find(skey) != sensemap.end()) {
            s = ontology.getSynsetIDForSenseKey(e, sensemap[skey]);
            nbMap++;
        }
        pos = (int)row_data[n_fields - 1];
        if (e < 0 || e == oov_id) continue;
        if (s > 0 && ontology.isWord(e) && ontology.isSense(s)) {
            if (pos != NOUN_TYPE) continue;
            nES.incr(e, s);
            pS[s]++;
            nSE.incr(s, e);
        }
    }
    cout << "INIT " << nbMap << " mapping done" << endl;

    pES.normalizeCond(nES, false);
    pS.normalize();
    pSE.normalizeCond(nSE, false);

    compute_pTC();

    progress.set(0, train_bitext_tgt.size(), "INIT_initialize_nFS_nEF", 50);
    progress.init();
    progress.draw();
    for (i = 0; i < train_bitext_tgt.size(); i++) {
        tgt_word_id = (int)train_bitext_tgt[i];
        src_word_id = (int)train_bitext_src[i];
        if (ontology.isWord(src_word_id) && target_wsd_voc.find(tgt_word_id) != target_wsd_voc.end()) {
            nEF.incr(src_word_id, tgt_word_id);
            pE[src_word_id]++;
            pF[tgt_word_id]++;

            src_senses = ontology.getWordNounSenses(src_word_id);
            maxp = 0;
            maxs = 0;
            for (sit = src_senses.begin(); sit != src_senses.end(); ++sit) {
                real src_sense_proba = pES.get(src_word_id, *sit);
                if (src_sense_proba != 0) {
                    // The target word inherits every non-null sense of its translation.
                    nFS.set(tgt_word_id, *sit, 1);
                    target_word_to_senses[tgt_word_id].insert(*sit);
                }
            }
        }
        progress.update(i);
    }
    progress.done();

    cout << "Init:attach french words" << endl;
    compute_node_level();
    for (f = 0; f < target_wsd_voc_size; f++) {
        cout << target_id_to_word[f] << endl;
        if (nEF.sumCol(f) == 1) {
            // Single translation: pick the most probable sense of that translation.
            map<int, real>& col_f = nEF.getCol(f);
            map<int, real>::iterator it = col_f.begin();
            e = it->first;
            maxp = 0;
            maxs = 0;
            for (set<int>::iterator lit1 = f_possible_senses[f].begin(); lit1 != f_possible_senses[f].end(); lit1++) {
                s = *lit1;
                p = pES(e, s) * pS[s];
                if (maxp < p) {
                    maxp = p;
                    maxs = s;
                }
            }
        } else {
            // Several translations: score each candidate sense by the
            // specificity of its common ancestors with the other candidates.
            symscore.clear();
            for (set<int>::iterator lit1 = f_possible_senses[f].begin(); lit1 != f_possible_senses[f].end(); lit1++) {
                i = *lit1;
                for (set<int>::iterator lit2 = f_possible_senses[f].begin(); lit2 != f_possible_senses[f].end(); lit2++) {
                    j = *lit2;
                    if (i == j) continue;
                    c = getDeepestCommonAncestor(i, j);
                    symscore[i] += -log(pTC[c]);
                }
            }
            int nb_fr_sense = 10;
            for (i = 0; i < nb_fr_sense; i++) {
                if (symscore.size() != 0) {
                    si = argmax(symscore);
                    symscore[si] = 0;
                    if (si != 0) {
                        cout << target_id_to_word[f] << " argmax=" << si << " ";
                        ontology.printSynset(si);
                        nFS.set(f, si, 1);
                        target_word_to_senses[f].insert(si);
                    }
                }
            }
        }
    }

    pFS.normalizeCond(nFS, false);

    // Record the deepest common ancestor of every admissible sense pair.
    int deepestComNode;
    Set e_senses;
    Set f_senses;
    progress.set(0, source_wsd_voc_size * target_wsd_voc_size, "INIT_compute_commNode", 50);
    progress.init();
    progress.draw();
    i = 0;

    Set e_words = ontology.getAllWords();
    for (sit = e_words.begin(); sit != e_words.end(); ++sit) {
        e = *sit;
        e_senses = ontology.getWordNounSenses(e);
        for (f = 0; f < target_wsd_voc_size; f++) {
            f_senses = target_word_to_senses[f];
            for (SetIterator esit = e_senses.begin(); esit != e_senses.end(); ++esit) {
                if (pES.get(e, *esit) == 0) continue;
                for (SetIterator fsit = f_senses.begin(); fsit != f_senses.end(); ++fsit) {
                    if (pFS.get(f, *fsit) == 0) continue;
                    deepestComNode = getDeepestCommonAncestor(*esit, *fsit);
                    commNode(e, f).insert(deepestComNode);
                    sens_to_conceptAncestors[*esit].insert(deepestComNode);
                    sens_to_conceptAncestors[*fsit].insert(deepestComNode);
                    if (pTC[deepestComNode] == 0)
                        PLERROR("compute_commNode : pTC[common ancestor]==0");
                    pA[deepestComNode] = INIT_P_A;
                }
            }
            i++;
            progress.update(i);
        }
    }
    progress.done();

    check_set_pA();
    compute_pMC();

    pEF.normalizeJoint(nEF);
    pE.smoothNormalize("pE");
    pF.smoothNormalize("pF");

    check_consitency();
    return;
}

int GraphicalBiText::getDeepestCommonAncestor(int s1, int s2)
{
    list<int> candidates;
    int cand;
    Node* candNode;
    Node* ss2;
    SetIterator it;
    Set s1_ancestors;

    s1_ancestors = ontology.getSynsetAncestors(s1, -1);
    // An internal node counts as its own ancestor.
    if (pTC[s1] != 0) {
        s1_ancestors.insert(s1);
    }
    if (pTC[s2] != 0) {
        candidates.push_back(s2);
    }
    ss2 = ontology.getSynset(s2);
    for (it = ss2->parents.begin(); it != ss2->parents.end(); ++it) {
        candidates.push_back(*it);
    }
    // Breadth-first search upward from s2 until an ancestor of s1 is met.
    while (!candidates.empty()) {
        cand = candidates.front();
        candidates.pop_front();
        if (s1_ancestors.find(cand) != s1_ancestors.end()) {
            return cand;
        } else {
            candNode = ontology.getSynset(cand);
            for (it = candNode->parents.begin(); it != candNode->parents.end(); ++it) {
                candidates.push_back(*it);
            }
        }
    }
    PLERROR("No common ancestor for %d and %d", s1, s2);
    return 0;
}
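
// compute_pTC: pTC[c] is the total sense probability mass in the subtree
// rooted at c.  Each sense s contributes pS[s] to itself (when it is also
// an internal node) and, through distribute_pS_on_ancestors, an equal share
// pS[s] / |parents| to each of its parents, recursively.  In effect
//   pTC[c] = sum over senses s below c of pS[s],
// with the mass split across multiple parents when the hierarchy is a DAG
// rather than a tree.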
void GraphicalBiText::compute_pTC()
{
    SetIterator sit;
    pTC.clear();
    Set ss_set = ontology.getAllCategories();
    int s;

    for (sit = ss_set.begin(); sit != ss_set.end(); ++sit) {
        s = *sit;
        if (ontology.isPureCategory(s)) continue;
        if (ontology.isPureSense(s)) {
            pTC[s] = 0;
        } else {
            pTC[s] += pS[s];
        }
        distribute_pS_on_ancestors(s, pS[s]);
    }
}

// Same computation, restricted to the senses of a single word.
void GraphicalBiText::compute_pTC(int word)
{
    SetIterator sit;
    pTC.clear();
    Set w_senses;
    Set ss_set = ontology.getAllCategories();
    int s;

    for (sit = ss_set.begin(); sit != ss_set.end(); ++sit) {
        s = *sit;
        if (ontology.isPureCategory(s)) continue;
        w_senses = ontology.getWordSenses(word);
        if (w_senses.find(s) == w_senses.end()) continue;
        if (ontology.isPureSense(s)) {
            pTC[s] = 0;
        } else {
            pTC[s] += pS[s];
        }
        distribute_pS_on_ancestors(s, pS[s]);
    }
}

void GraphicalBiText::distribute_pS_on_ancestors(int s, real probaToDistribute)
{
    real proba;
    Set ss_anc;
    SetIterator sit;
    ss_anc = ontology.getSynsetParents(s);

    proba = probaToDistribute / ss_anc.size();
    for (sit = ss_anc.begin(); sit != ss_anc.end(); ++sit) {
        pTC[*sit] += proba;
        distribute_pS_on_ancestors(*sit, proba);
    }
}

void GraphicalBiText::compute_node_level()
{
    list<int> desc;
    SetIterator sit, ssit;
    Set ss_anc;
    Node *node;
    bool incomplete;
    int s, max_level, par;
    node = ontology.getSynset(ROOT_SS_ID);

    for (sit = node->children.begin(); sit != node->children.end(); ++sit) {
        if (pTC[*sit] == 0) continue;
        desc.push_back(*sit);
    }
    node_level[ROOT_SS_ID] = 1;
    for (list<int>::iterator lit = desc.begin(); lit != desc.end(); lit++) {
        s = *lit;
        // Skip nodes already handled or outside the active hierarchy.
        if (pMC[s] != 0) continue;
        if (pTC[s] == 0) continue;

        node = ontology.getSynset(s);
        ss_anc.clear();
        ontology.extractAncestors(node, ss_anc, 1, 1);
        max_level = 0;
        incomplete = false;
        for (ssit = ss_anc.begin(); ssit != ss_anc.end(); ++ssit) {
            par = *ssit;
            if (node_level[par] == 0) {
                PLWARNING("tried to compute level for a node (%d) and level for its parent (%d) is not computed", s, *ssit);
                incomplete = true;
                break;
            }
            if (node_level[par] > max_level) max_level = node_level[par];
        }
        if (!incomplete) {
            node_level[s] = max_level + 1;
            node = ontology.getSynset(s);
            for (sit = node->children.begin(); sit != node->children.end(); ++sit) {
                if (!ontology.isSynset(*sit)) continue;
                desc.push_back(*sit);
            }
        } else {
            // Requeue: its parents' levels are not known yet.
            desc.push_back(s);
        }
    }
}
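
// compute_pMC: top-down computation of pMC[s], the probability of reaching
// node s on a walk down from the root, where pA[c] is the probability of
// stopping at c.  For each parent par of s,
//   pMC[s] += pMC[par] * (1 - pA[par]) * pTC[s] / sum_{s' child of par} pTC[s'],
// and the probability that the walk ends exactly at s is pC[s] = pMC[s] * pA[s].
// Nodes whose parents are not done yet are pushed back onto the work list.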
void GraphicalBiText::compute_pMC()
{
    list<int> desc;
    SetIterator sit, ssit;
    Set ss_anc;
    Node *node, *node_par;
    bool incomplete;
    int s, par;
    real proba_mass;
    real sum_pTC_par;
    real check_sum = 0;

    pMC.clear();
    pC.clear();
    node = ontology.getSynset(ROOT_SS_ID);

    for (sit = node->children.begin(); sit != node->children.end(); ++sit) {
        if (pTC[*sit] == 0) continue;
        desc.push_back(*sit);
    }

    pMC[ROOT_SS_ID] = 1;
    pC[ROOT_SS_ID] = pMC[ROOT_SS_ID] * pA[ROOT_SS_ID];
    check_sum = pC[ROOT_SS_ID];
    for (list<int>::iterator lit = desc.begin(); lit != desc.end(); lit++) {
        incomplete = false;
        s = *lit;
        // Skip nodes already processed or outside the active hierarchy.
        if (pMC[s] != 0) continue;
        if (pTC[s] == 0) continue;

        node = ontology.getSynset(s);
        ss_anc.clear();
        ontology.extractAncestors(node, ss_anc, 1, 1);
        proba_mass = 0;
        for (ssit = ss_anc.begin(); ssit != ss_anc.end(); ++ssit) {
            par = *ssit;
            if (pMC[par] == 0) {
                PLWARNING("tried to compute pMC for a node (%d) and pMC for its parent (%d) is not computed", s, *ssit);
                ontology.printSynset(*ssit);
                incomplete = true;
                break;
            }
            sum_pTC_par = 0;
            node_par = ontology.getSynset(par);
            for (sit = node_par->children.begin(); sit != node_par->children.end(); ++sit) {
                sum_pTC_par += pTC[*sit];
            }
            proba_mass += pMC[par] * (1.0 - pA[par]) * pTC[s] / sum_pTC_par;
        }

        if (incomplete) {
            pMC[s] = 0;
            desc.push_back(s);
        } else {
            node = ontology.getSynset(s);
            for (sit = node->children.begin(); sit != node->children.end(); ++sit) {
                if (!ontology.isSynset(*sit)) continue;
                desc.push_back(*sit);
            }
            pMC[s] = proba_mass;
            pC[s] = pMC[s] * pA[s];
            check_sum += pC[s];
        }
    }
}

bool lessPair(pair<int, float>& p1, pair<int, float>& p2)
{
    return p1.second < p2.second;
}
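
// set_nodemap: group together all senses of word e found in the subtree
// rooted at c.  The first sense encountered becomes the representative of
// the group, and sensemap then maps every other sense key of the group to
// the representative's sense key (the "coarse senses" of sensemap_level).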

void GraphicalBiText::set_nodemap(int c, int e)
{
    list<int> desc;
    SetIterator sit;
    int s;
    int common_sense = 0;
    Node *node;
    Set e_senses = ontology.getWordSenses(e);
    desc.push_back(c);
    for (list<int>::iterator lit = desc.begin(); lit != desc.end(); lit++) {
        s = *lit;
        if (e_senses.find(s) != e_senses.end() && pSE(s, e) != 0) {
            if (common_sense == 0) {
                common_sense = s;
                sensemap[ontology.getSenseKey(e, s)] = ontology.getSenseKey(e, s);
            } else {
                sensemap[ontology.getSenseKey(e, s)] = ontology.getSenseKey(e, common_sense);
            }
            cout << s << " " << pSE(s, e) << " " << ontology.getSenseKey(e, s)
                 << " -> " << sensemap[ontology.getSenseKey(e, s)] << endl;
        }
        node = ontology.getSynset(s);
        for (sit = node->children.begin(); sit != node->children.end(); ++sit) {
            if (!ontology.isSynset(*sit)) continue;
            if (pTC[*sit] == 0 && (e_senses.find(*sit) == e_senses.end() || pSE(*sit, e) == 0)) continue;
            desc.push_back(*sit);
        }
    }
}

void GraphicalBiText::print_sensemap()
{
    int e;
    SetIterator sit1, sit;
    cout << "Print_sensemap" << endl;

    Set e_words = ontology.getAllWords();
    for (sit1 = e_words.begin(); sit1 != e_words.end(); ++sit1) {
        e = *sit1;
        cout << source_id_to_word[e] << endl;
        Set e_senses = ontology.getWordSenses(e);
        for (sit = e_senses.begin(); sit != e_senses.end(); ++sit) {
        }
    }
}

void GraphicalBiText::compute_nodemap(int level)
{
    list<int> desc;
    SetIterator sit, ssit, sit1;
    Set ss_anc;
    Set e_senses;
    list<pair<int, float> > split_node;
    Node *node;
    int non_null_child;
    float max_level;
    map<int, float> split_level;
    int s, e;
    cout << "Compute_nodemap" << endl;

    Set e_words = ontology.getAllWords();
    for (sit1 = e_words.begin(); sit1 != e_words.end(); ++sit1) {
        e = *sit1;
        compute_pTC(e);
        e_senses = ontology.getWordSenses(e);
        nodemap.clear();
        split_level.clear();
        split_node.clear();
        desc.clear();
        desc.push_back(ROOT_SS_ID);
        for (list<int>::iterator lit = desc.begin(); lit != desc.end(); lit++) {
            s = *lit;
            node = ontology.getSynset(s);
            // A node "splits" the senses of e if it is itself a sense of e
            // or if at least two of its children carry sense mass for e.
            if (e_senses.find(s) != e_senses.end() && pSE(s, e) != 0) {
                non_null_child = 2;
            } else {
                non_null_child = 0;
                for (sit = node->children.begin(); sit != node->children.end(); ++sit) {
                    if (!ontology.isSynset(*sit)) continue;
                    if (pTC[*sit] == 0 && (pSE(*sit, e) == 0 || e_senses.find(*sit) == e_senses.end())) continue;
                    desc.push_back(*sit);
                    non_null_child++;
                }
            }

            if (s == ROOT_SS_ID) {
                max_level = 0;
            } else {
                ss_anc.clear();
                max_level = 0;
                ontology.extractAncestors(node, ss_anc, 1, 1);
                for (ssit = ss_anc.begin(); ssit != ss_anc.end(); ++ssit) {
                    if (split_level[*ssit] > max_level) max_level = split_level[*ssit];
                }
            }
            if (non_null_child >= 2) {
                split_level[s] = max_level + 1.0;
                split_node.push_back(make_pair(s, max_level + 1.0));
            } else {
                split_level[s] = max_level;
            }
        }

        // By default, each sense maps to itself.
        for (sit = e_senses.begin(); sit != e_senses.end(); ++sit) {
            sensemap[ontology.getSenseKey(e, *sit)] = ontology.getSenseKey(e, *sit);
        }
        // Group the senses under every split node at the requested level.
        for (list<pair<int, float> >::iterator lit = split_node.begin(); lit != split_node.end(); lit++) {
            if (lit->second == level) {
                set_nodemap(lit->first, e);
            }
        }
    }
}
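
// check_set_pA: (re)estimate the stopping probabilities pA.  For each
// internal node c, sum_TC collects the probability mass that cannot go
// further down (leaf-sense children, counted once per parent, plus c's own
// sense mass when c is itself a sense), so pA[c] = sum_TC / pTC[c] appears
// to be the fraction of the subtree mass that must stop at c.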

void GraphicalBiText::check_set_pA()
{
    real sum_TC;
    SetIterator sit, ssit;
    Set ss_desc;
    Set ss_set = ontology.getAllCategories();
    Node *node, *childnode;

    for (sit = ss_set.begin(); sit != ss_set.end(); ++sit) {
        if (pTC[*sit] == 0) continue;

        sum_TC = 0;
        node = ontology.getSynset(*sit);
        for (ssit = node->children.begin(); ssit != node->children.end(); ++ssit) {
            if (pTC[*ssit] != 0 && pS[*ssit] != 0) continue;
            childnode = ontology.getSynset(*ssit);
            sum_TC += pS[*ssit] / childnode->parents.size();
        }

        if (pTC[*sit] != 0 && pS[*sit] != 0) sum_TC += pS[*sit];

        if (sum_TC != 0) pA[*sit] = sum_TC / pTC[*sit];
    }
}

void GraphicalBiText::check_consitency()
{
    cout << "Consistency checking :";
    cout << " / pS-1 : " << sum(pS) - 1.0;
    cout << " / pSbase-1 : " << sum(pSbase) - 1.0;
    cout << " / pMC : " << sum(pMC);
    cout << " / pTC : " << sum(pTC);
    cout << " / pA : " << sum(pA);
    cout << " / pC-1 : " << sum(pC) - 1.0;
    cout << " / pF-1 : " << sum(pF) - 1.0;
    cout << " / pE-1 : " << sum(pE) - 1.0;
    cout << " / pH-1 : " << sum(pH) - 1.0;
    cout << " / pHupbi-1 : " << sum(pHupbi) - 1.0;
    cout << " / pFS : " << pFS.checkCondProbIntegrity();
    cout << " / pES : " << pES.checkCondProbIntegrity();
    cout << " / pHSupbi : " << pHSupbi.checkCondProbIntegrity();
    cout << " / pHS : " << pHS.checkCondProbIntegrity();
    cout << " / pEF-1 : " << pEF.sumOfElements() - 1.0 << endl;
}

void GraphicalBiText::print(string name)
{
    real proba;
    real like_sum = 0;
    real efs_sum;
    int e, f, k, s;
    TVec<int> e_senses;
    SetIterator sit;
    int e_voc_size = ontology.getVocSize();
    string filename = output_dir + "out_gra" + name;
    ofstream out_gra(filename.c_str());
    if (!out_gra.is_open()) {
        PLERROR("error printing hierarchy");
    }

    ShellProgressBar progress(0, e_voc_size, "e_f_s_probabilities", 50);
    progress.init();
    progress.draw();

    Set e_words = ontology.getAllWords();
    for (sit = e_words.begin(); sit != e_words.end(); ++sit) {
        e = *sit;
        for (f = 0; f < target_wsd_voc_size; f++) {
            e_senses = ontology.getSensesForWord(e);
            like_sum += compute_BN_likelihood(e, f, 0, 1);
            efs_sum = 0;
            for (k = 0; k < e_senses.size(); k++) {
                s = e_senses[k];
                proba = compute_efs_likelihood(e, f, s);
                efs_sum += proba;
                out_gra << target_id_to_word[f] << "\t" << source_id_to_word[e]
                        << "\t" << proba << "\t" << ontology.getSenseKey(e, s);
            }
            if (efs_sum - 1.0 > PROB_PREC)
                PLERROR("print : efs doesn't sum to 1 for (%d,%d)", e, f);
        }
        progress.update(e);
    }
    progress.done();
    cout << " checksum likelihood-1.0 : " << like_sum - 1.0 << endl;
    Set ss_set = ontology.getAllCategories();
}
void GraphicalBiText::printHierarchy(string name)
{
    string filename = "/u/kermorvc/HTML/Treebolic/hierarchy" + name + ".xml";
    ofstream out_hie(filename.c_str());
    if (!out_hie.is_open()) {
        PLERROR("error printing hierarchy");
    }
    out_hie << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE treebolic SYSTEM \"Treebolic.dtd\">\n<treebolic toolbar=\"true\" focus-on-hover=\"false\"><tree orientation=\"radial\" expansion=\"0.9\" sweep=\"1.2\" backcolor=\"fffff0\" fontface=\"true\" fontsize=\"20\" fontsizestep=\"2\">\n";
    printNode(ROOT_SS_ID, out_hie);
    out_hie << "<edges></edges></tree></treebolic>";
}

void GraphicalBiText::printNode(int ss, ostream &out_hie)
{
    SetIterator sit, ssit;
    Set word_set;
    int word;
    Node *node = ontology.getSynset(ss);
    string color;
    if (pTC[ss] == 0) {
        color = "cc0099";
    } else if (pC[ss] == 0) {
        color = "99ffff";
    } else {
        color = "0000ff";
    }
    out_hie << "<node id=\"" << ss << "\" backcolor=\"" << color << "\" forecolor=\"ffffff\">" << endl;
    out_hie << "<label>" << node->syns << "</label>" << endl;
    out_hie << "<content> pC=" << pC[ss] << " pMC=" << pMC[ss] << " pTC=" << pTC[ss]
            << " pA=" << pA[ss] << " pS=" << pS[ss] << " ss=" << ss << node->gloss << endl;
    out_hie << "</content>";

    if (pS[ss] != 0 && pTC[ss] != 0) {
        out_hie << "<node id=\"" << ss << "\" backcolor=\"ff33cc\" forecolor=\"ffffff\">" << endl;
        out_hie << "<label>" << node->syns << "</label>" << endl;
        out_hie << "<content> pS=" << pS[ss] << " ss=" << ss << node->gloss << endl;
        out_hie << "</content>";
    }

    word_set = ontology.getSynsetWordDescendants(ss);
    for (ssit = word_set.begin(); ssit != word_set.end(); ++ssit) {
        word = *ssit;
        if (pES.get(word, ss) != 0) {
            out_hie << "<node id=\"w" << word << "\" backcolor=\"ff9050\" forecolor=\"ffffff\">" << endl;
            out_hie << "<label> " << source_id_to_word[word] << "</label>" << endl;
            out_hie << "<content>" << pES.get(word, ss) << " id=" << word << "</content>" << endl;
            out_hie << "</node>" << endl;
        }
    }

    for (ssit = target_wsd_voc.begin(); ssit != target_wsd_voc.end(); ++ssit) {
        word = *ssit;
        if (pFS.get(word, ss) != 0) {
            out_hie << "<node id=\"w" << word << "\" backcolor=\"00EE00\" forecolor=\"ffffff\">" << endl;
            out_hie << "<label> " << target_id_to_word[word] << "</label>" << endl;
            out_hie << "<content>" << pFS.get(word, ss) << " id=" << word << "</content>" << endl;
            out_hie << "</node>" << endl;
        }
    }

    if (pS[ss] != 0 && pTC[ss] != 0) {
        out_hie << " </node>" << endl;
    }

    for (sit = node->children.begin(); sit != node->children.end(); ++sit) {
        if (pTC[*sit] != 0 || pS[*sit] != 0) {
            printNode(*sit, out_hie);
        }
    }
    out_hie << " </node>" << endl;
}
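
// update_WSD_model: re-estimate the context model from the sense-tagged
// bitext.  nESupbi starts from the supervised counts nESbase; then, for
// every bitext pair (e, f) whose posterior sense probability
// p(s|e,f) = compute_efs_likelihood(e, f, s) exceeds update_threshold, the
// context counts nHSupbi(h, s) of the window_size words on each side of e
// (stopping at sentence boundaries, marked by '.') are incremented by the
// posterior.  The updated counts are renormalized with backoff as in init_WSD.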

void GraphicalBiText::update_WSD_model(string name)
{
    TVec<int> e_senses;
    int i, j, k, h, e, f;
    real proba;
    int nbsent = 0;
    nHSupbi.clear();
    pHSupbi.clear();

    nESupbi.clear();
    // Start from the supervised baseline counts.
    for (int j = 0; j < nESbase.getWidth(); j++) {
        map<int, real>& col_j = nESbase.getCol(j);
        for (map<int, real>::iterator it = col_j.begin(); it != col_j.end(); ++it) {
            if (it->second != 0) nESupbi.set(it->first, j, it->second);
        }
    }

    pHupbi.clear();

    int point_index = source_word_to_id[tostring(".")];

    string filename = output_dir + "/out_bi" + name;
    ofstream out_bi(filename.c_str());
    if (!out_bi.is_open()) {
        PLERROR("error while out_bi");
    }
    ShellProgressBar progress(0, train_bitext_src.size() - 1, "Updating_WSD_model ", 50);
    progress.init();
    progress.draw();

    for (i = 0; i < train_bitext_src.size() - 1; i++) {
        e = (int)train_bitext_src[i];
        f = (int)train_bitext_tgt[i];
        if (e == point_index) {
            nbsent++;
            continue;
        }
        if (ontology.isWord(e) && target_wsd_voc.find(f) != target_wsd_voc.end()) {
            e_senses = ontology.getSensesForWord(e);
            for (k = 0; k < e_senses.size(); k++) {
                int s = e_senses[k];
                proba = compute_efs_likelihood(e, f, s);
                if (proba > update_threshold) {
                    out_bi << target_id_to_word[f] << "\t" << source_id_to_word[e] << "\t"
                           << sensemap[ontology.getSenseKey(e, s)] << "\t" << proba << endl;
                    if (proba != 0) {
                        // Right context.
                        for (j = 1; j <= window_size; j++) {
                            h = (int)train_bitext_src[i + j];
                            if (h == point_index) break;
                            pHupbi[h]++;
                            nHSupbi.incr(h, s, proba);
                        }
                        // Left context.
                        for (j = 1; j <= window_size; j++) {
                            h = (int)train_bitext_src[i - j];
                            if (h == point_index) break;
                            pHupbi[h]++;
                            nHSupbi.incr(h, s, proba);
                        }
                    }
                }
            }
        } else {
            out_bi << target_id_to_word[f] << "\t" << source_id_to_word[e] << endl;
        }
        progress.update(i);
    }
    progress.done();
    cout << "Updating WSD model : " << nbsent << " sentences processed" << endl;

    pHupbi.smoothNormalize("pHupbi");
    pHSupbi.normalizeCondBackoff(nHSupbi, 0.1, pHupbi, false, false);
    pESupbi.normalizeCondBackoff(nESupbi, 0.1, pEbase, false, false);
}

void GraphicalBiText::senseTagBitext(string name)
{
    TVec<int> e_senses;
    int i, k, e, f;
    real proba = 0;
    int sent_b, sent_e;

    sent_b = 0;
    sent_e = 0;
    i = 0;

    string filename = output_dir + "out_bi" + name;
    ofstream out_bi(filename.c_str());
    if (!out_bi.is_open()) {
        PLERROR("error while out_bi");
    }

    ShellProgressBar progress(0, train_bitext_src.size() - 1, "SenseTagBitext", 50);
    progress.init();
    progress.draw();

    for (i = 0; i < train_bitext_src.size(); i++) {
        e = (int)train_bitext_src[i];
        f = (int)train_bitext_tgt[i];
        out_bi << target_id_to_word[f] << "\t" << source_id_to_word[e] << endl;

        if (ontology.isWord((int)train_bitext_src[i])
            && target_wsd_voc.find((int)train_bitext_tgt[i]) != target_wsd_voc.end()) {
            e_senses = ontology.getSensesForWord((int)train_bitext_src[i]);
            for (k = 0; k < e_senses.size(); k++) {
                int s = e_senses[k];
                proba = compute_efs_likelihood(e, f, s);
                out_bi << target_id_to_word[f] << "\t" << source_id_to_word[e] << "\t" << proba
                       << "\t" << ontology.getSenseKey(e, s) << "\t" << s << endl;
            }
        }
        progress.update(i);
    }
    progress.done();
}

void GraphicalBiText::sensetag_valid_bitext(string name)
{
    TVec<int> e_senses;
    int i, k, maxs, e, f;
    real proba = 0, ps;

    string filename = output_dir + "out_bi" + name;
    ofstream out_bi(filename.c_str());
    if (!out_bi.is_open()) {
        PLERROR("error while out_bi");
    }

    ShellProgressBar progress(0, valid_bitext_src.size() - 1, "Sensetag_valid_bitext ", 50);
    progress.init();
    progress.draw();

    for (i = 0; i < valid_bitext_src.size(); i++) {
        e = (int)valid_bitext_src[i];
        f = (int)valid_bitext_tgt[i];
        if (ontology.isWord(e) && target_wsd_voc.find(f) != target_wsd_voc.end()) {
            maxs = -1;
            ps = 0;
            e_senses = ontology.getSensesForWord(e);
            // Keep the most probable sense.
            for (k = 0; k < e_senses.size(); k++) {
                int s = e_senses[k];
                proba = compute_efs_likelihood(e, f, s);
                if (proba > ps) {
                    ps = proba;
                    maxs = s;
                }
            }
            out_bi << target_id_to_word[f] << "\t" << source_id_to_word[e] << "\t" << ps << endl;
            progress.update(i);
        }
    }
    progress.done();
}
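
// compute_efs_likelihood: posterior probability p(se | e, f) under the
// graphical model.  With peC[c] = sum_{s below c} p(e|s) p(s) / pTC[c] and
// pfC[c] defined symmetrically for f, the pair likelihood restricted to the
// admissible concepts commNode(e, f) is
//   like = sum_c peC[c] * pfC[c] * pC[c],
// and the returned posterior for sense se is
//   post = sum_{c ancestor of se} pC[c] * p(e|se) p(se) / pTC[c] * pfC[c] / like.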

real GraphicalBiText::compute_efs_likelihood(int e, int f, int se)
{
    int s, c;
    real pws;
    real post;
    real like = 0;
    Vec peC;
    Vec pfC;
    Set ss_anc;
    SetIterator sit, ssit;
    Set ss_adm;
    set<int> ss_admAnc;
    Set synsets;

    peC.resize(ss_size);
    pfC.resize(ss_size);

    synsets = ontology.getAllCategories();

    // Propagate p(e|s) p(s) up to every ancestor concept c.
    for (sit = synsets.begin(); sit != synsets.end(); ++sit) {
        s = *sit;
        pws = pES.get(e, s);
        if (pws != 0) {
            ss_anc = ontology.getSynsetAncestors(s);
            if (pTC[s] != 0) {
                ss_anc.insert(s);
            }
            for (ssit = ss_anc.begin(); ssit != ss_anc.end(); ++ssit) {
                c = *ssit;
                peC[c] += pws * pS[s];
            }
        }
    }

    // Same propagation for the target word f.
    synsets = target_word_to_senses[f];
    for (sit = synsets.begin(); sit != synsets.end(); ++sit) {
        s = *sit;
        pws = pFS.get(f, s);
        if (pws != 0) {
            ss_anc = ontology.getSynsetAncestors(s);
            if (pTC[s] != 0) {
                ss_anc.insert(s);
            }
            for (ssit = ss_anc.begin(); ssit != ss_anc.end(); ++ssit) {
                c = *ssit;
                pfC[c] += pws * pS[s];
            }
        }
    }
    ss_adm = commNode(e, f);
    for (ssit = ss_adm.begin(); ssit != ss_adm.end(); ++ssit) {
        c = *ssit;
        if (peC[c] != 0) {
            if (pTC[c] == 0) {
                PLERROR("compute_BN_likelihood : division by zero leC/pTC");
            }
            peC[c] /= pTC[c];
        }
    }
    for (ssit = ss_adm.begin(); ssit != ss_adm.end(); ++ssit) {
        c = *ssit;
        if (pfC[c] != 0) {
            if (pTC[c] == 0) {
                PLERROR("compute_BN_likelihood : division by zero lfC/pTC");
            }
            pfC[c] /= pTC[c];
        }
    }

    for (ssit = ss_adm.begin(); ssit != ss_adm.end(); ++ssit) {
        c = *ssit;
        like += peC[c] * pfC[c] * pC[c];
    }

    // Posterior probability of sense se given (e, f).
    post = 0;
    if (like != 0) {
        ss_anc = ontology.getSynsetAncestors(se);
        if (pTC[se] != 0) {
            ss_anc.insert(se);
        }
        ss_adm = commNode(e, f);
        set_intersection(ss_anc.begin(), ss_anc.end(), ss_adm.begin(), ss_adm.end(),
                         inserter(ss_admAnc, ss_admAnc.begin()));
        pws = pES.get(e, se);
        if (pws != 0) {
            for (ssit = ss_admAnc.begin(); ssit != ss_admAnc.end(); ++ssit) {
                c = *ssit;
                if (ss_anc.find(c) == ss_anc.end()) continue;
                post += pC[c] * pws * pS[se] / pTC[c] * pfC[c] / like;
            }
        }
    }
    return post;
}

void GraphicalBiText::test_WSD(VMat test_set, string name, TVec<string> v, bool select, real interp)
{
    int e, s, target, pos, smax, smaxb, smaxs, h;
    real nb_supervised = 0;
    real nb_correct = 0;
    real nb_single = 0;
    real nb_unknown = 0;
    real nb_undef = 0;
    real nb_correctb = 0;
    real nb_undefb = 0;
    real nb_corrects = 0;
    real nb_correctrandom = 0;
    real nb_correctwn = 0;
    real nb_undefs = 0;
    real max, maxb, maxs, p, pupbi, ps, q, qb;
    int nbMap = 0;

    Vec dMatch(source_wsd_voc_size);
    Vec dMatchBi(source_wsd_voc_size);
    Vec dMatchStup(source_wsd_voc_size);
    Vec dNumber(source_wsd_voc_size);
    if (!select) {
        BiSelect.clear();
    }
    if (select) cout << "WSD_number_BiSelected " << BiSelect.size() << endl;

    Set source_words;
    SetIterator ssit;

    string filename;
    real context_coeff;
    TVec<int> e_senses;
    int e_senses_size;
    int oov_id = ontology.getWordId(OOV_TAG);
    string skey;
    int i, j, k;
    ShellProgressBar progress;

    string diff;
    int test_set_size = test_set->length();
    cout << "WSD_" + name + " size = " << test_set_size << endl;

    progress.set(0, test_set_size, "Predict " + name + " senses", 50);
    progress.init();
    progress.draw();
#ifdef PRINT_WSD
    filename = output_dir + "/out_wsd" + name;
    ofstream out_wsd(filename.c_str());
    if (!out_wsd.is_open()) {
        PLERROR("error while opening out_wsd");
    }
#endif
    Vec row_data;
    row_data.resize(n_fields);
    for (i = 0; i < test_set_size; i++) {
        test_set->getRow(i, row_data);
        if (row_data.size() != n_fields)
            PLERROR("row_data[%d].size = %d, but n_fields = %d", i, row_data.size(), n_fields);
        e = (int)row_data[n_fields - 3];
#ifdef PRINT_WSD
        out_wsd << source_id_to_word[e] << " ";
#endif
        if (!ontology.isWord(e)) continue;

        s = (int)row_data[n_fields - 2];

        skey = ontology.getSenseKey(e, s);
        if (sensemap_level > 0 && s > 0 && sensemap.find(skey) != sensemap.end()) {
            nbMap++;
            target = ontology.getSynsetIDForSenseKey(e, sensemap[skey]);
        } else {
            target = s;
        }
        pos = (int)row_data[n_fields - 1];
        if (pos != NOUN_TYPE) continue;
#ifdef PRINT_WSD
        out_wsd << " tar=" << target << " pos=" << pos << endl;
#endif

        if (target >= 0) {
            if (1) {
                switch (pos) {
                case NOUN_TYPE:
                    e_senses = ontology.temp_word_to_noun_senses[e];
                    break;
                case VERB_TYPE:
                    e_senses = ontology.temp_word_to_verb_senses[e];
                    break;
                case ADJ_TYPE:
                    e_senses = ontology.temp_word_to_adj_senses[e];
                    break;
                case ADV_TYPE:
                    e_senses = ontology.temp_word_to_adv_senses[e];
                    break;
                case UNDEFINED_TYPE:
                    e_senses = ontology.getSensesForWord(e);
                    break;
                default:
                    PLERROR("weird in train, pos = %d", pos);
                }
            } else {
                e_senses = ontology.getSensesForWord(e);
            }
            e_senses_size = e_senses.size();
            if (e_senses_size == 0) {
                nb_unknown++;
                v[(int)nb_supervised] = "-1";
                nb_supervised++;
                continue;
            }

            if (e_senses_size == 1) {
                nb_single++;
                v[(int)nb_supervised] = ontology.getSenseKey(e, e_senses[0]);
                dNumber[e]++;
                nb_supervised++;
                continue;
            }

            maxb = -FLT_MAX;
            max = -FLT_MAX;
            maxs = maxb;
            smax = -1;
            smaxb = -1;
            smaxs = smaxb;

            for (j = 0; j < e_senses_size; j++) {
                int s = e_senses[j];
                p = log(pESbase.get(e, s)) + log(pSbase[s]);
                pupbi = p;
                ps = p;
#ifdef PRINT_WSD
                out_wsd << "pES=" << pES.get(e, s) << " pS=" << pSbase[s];
#endif
                if (window_size != 0) {
                    context_coeff = 1.0 / (2 * window_size);
                    for (k = 0; k < 2 * window_size; k++) {
                        h = (int)row_data[3 * k];
#ifdef PRINT_WSD
                        out_wsd << "/" << source_id_to_word[h];
#endif
                        if (h == oov_id) continue;

                        q = pHS.get(h, s);
                        qb = pHSupbi.get(h, s);
                        if (qb > 1) PLERROR("qb>1 %f", qb);

                        p += context_coeff * (log(q));
                        pupbi += context_coeff * (interp * log(qb) + (1.0 - interp) * log(q));
#ifdef PRINT_WSD
                        out_wsd << "," << q << "," << qb;
#endif
                    }
                }
#ifdef PRINT_WSD
                out_wsd << " s=" << s << " p=" << p << " pupbi=" << pupbi << endl;
#endif
                if (p > max) { max = p; smax = s; }
                if (pupbi > maxb) { maxb = pupbi; smaxb = s; }
                if (ps > maxs) { maxs = ps; smaxs = s; }
            }

            // NaiveBayes prediction.
            if (max == -FLT_MAX) {
                nb_undef++;
                smax = e_senses[0];
            }
            if (target == smax) {
                nb_correct++;
                dMatch[e]++;
            }

            // StupidBayes prediction.
            if (maxs == -FLT_MAX) {
                nb_undefs++;
                smaxs = e_senses[0];
            }
            if (target == smaxs) {
                nb_corrects++;
                dMatchStup[e]++;
            }

            // First-sense (WordNet) baseline.
            smaxs = e_senses[0];
            if (target == smaxs) {
                nb_correctwn++;
            }

            // Random baseline.
            smaxs = e_senses[(int)floor(rand() / (RAND_MAX + 1.0) * (float)e_senses.size())];
            if (target == smaxs) {
                nb_correctrandom++;
            }

            // Bitext prediction.
            if (maxb == -FLT_MAX) {
                nb_undefb++;
                smaxb = e_senses[0];
            }
            if (select) {
                if (BiSelect.find(e) == BiSelect.end()) smaxb = smax;
            }
            if (target == smaxb) {
                nb_correctb++;
                dMatchBi[e]++;
            }
            v[(int)nb_supervised] = ontology.getSenseKey(e, smaxb);
#ifdef PRINT_WSD
            out_wsd << " best " << source_id_to_word[e] << " e=" << e << " tar=" << target
                    << " hyp=" << smaxb << " " << ontology.getSenseKey(e, smaxb) << endl;
#endif
            dNumber[e]++;
            nb_supervised++;
        }
#ifdef PRINT_WSD
        out_wsd << endl;
#endif
        progress.update(i);
    }
    progress.done();

    filename = output_dir + "out_score_" + name;
    ofstream out_score(filename.c_str());
    if (!out_score.is_open()) {
        PLERROR("error while opening out_score");
    }
    source_words = ontology.getAllWords();
    for (ssit = source_words.begin(); ssit != source_words.end(); ++ssit) {
        e = *ssit;
        if (dNumber[e] == 0) continue;
        if (dMatch[e] < dMatchBi[e]) { diff = "+"; } else { diff = "-"; }
        out_score << diff << "\t" << source_id_to_word[e] << "\t" << dNumber[e] << "\t"
                  << dMatch[e] << "\t" << dMatchBi[e] << "\t" << dMatchStup[e] << endl;
        if (!select && dMatch[e] < dMatchBi[e]) BiSelect[e] = true;
    }
    out_score << "#WSD " << nbMap << " mapping done" << endl;
    out_score << "#WSD " + name + " Random correct :" << nb_correctrandom << " / " << nb_supervised
              << " = " << nb_correctrandom / nb_supervised * 100 << endl;
    out_score << "#WSD " + name + " StupidWordNet correct :" << nb_correctwn << " / " << nb_supervised
              << " = " << nb_correctwn / nb_supervised * 100 << endl;
    out_score << "#WSD " + name + " StupidBayes correct :" << nb_corrects << " / " << nb_supervised
              << " = " << nb_corrects / nb_supervised * 100 << " % - " << nb_undefs << " undefined" << endl;
    out_score << "#WSD " + name + " NaiveBayes correct :" << nb_correct << " / " << nb_supervised
              << " = " << nb_correct / nb_supervised * 100 << " % - " << nb_undef << " undefined" << endl;
    out_score << "#WSD " + name + " Bitext correct :" << nb_correctb << " / " << nb_supervised
              << " = " << nb_correctb / nb_supervised * 100 << " % - " << nb_undefb << " undefined - "
              << nb_single << " single sense words " << nb_unknown << " unknown words " << endl;
    out_score.close();
#ifdef PRINT_WSD
    out_wsd.close();
#endif
}
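
// compute_BN_likelihood: same likelihood computation as
// compute_efs_likelihood, but returning the pair likelihood itself and,
// when update is true, performing an EM E-step: the expected counts nA,
// nES, nFS and nS are incremented by nb times the corresponding posteriors,
// with the chk_up_* accumulators verifying (up to PROB_PREC) that each
// family of posteriors sums to the expected total.  The last block
// accumulates the per-sense posteriors into nSEbi for the bitext entropy
// computation.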
real GraphicalBiText::compute_BN_likelihood(int e, int f, bool update, real nb)
{
    int s, c, se;
    real p, pws;
    real like = 0;
    real post, sumpost;

    Vec peC;
    Vec pfC;
    Set ss_anc;
    SetIterator sit, ssit;
    peC.resize(ss_size);
    pfC.resize(ss_size);
    Set ss_adm;
    set<int> ss_admAnc;
    ss_adm = commNode(e, f);

    // Accumulate unnormalized P(e|c) over every ancestor concept c of the senses of e.
    Set synsets = ontology.getAllCategories();
    for (sit = synsets.begin(); sit != synsets.end(); ++sit) {
        s = *sit;
        pws = pES.get(e, s);
        if (pws != 0) {
            ss_anc = ontology.getSynsetAncestors(s);
            if (pTC[s] != 0) {
                ss_anc.insert(s);
            }
            for (ssit = ss_anc.begin(); ssit != ss_anc.end(); ++ssit) {
                c = *ssit;
                peC[c] += pws * pS[s];
            }
        }
    }
    synsets.clear();
    ss_anc.clear();

    // Same accumulation for the target word f, restricted to its own senses.
    synsets = target_word_to_senses[f];
    for (sit = synsets.begin(); sit != synsets.end(); ++sit) {
        s = *sit;
        pws = pFS.get(f, s);
        if (pws != 0) {
            ss_anc = ontology.getSynsetAncestors(s);
            if (pTC[s] != 0) {
                ss_anc.insert(s);
            }
            for (ssit = ss_anc.begin(); ssit != ss_anc.end(); ++ssit) {
                c = *ssit;
                pfC[c] += pws * pS[s];
            }
        }
    }

    // Normalize by the total sense mass under each admissible concept.
    for (ssit = ss_adm.begin(); ssit != ss_adm.end(); ++ssit) {
        c = *ssit;
        if (peC[c] != 0) {
            if (pTC[c] == 0) { PLERROR("compute_BN_likelihood : division by zero peC/pTC"); }
            peC[c] /= pTC[c];
        }
    }
    for (ssit = ss_adm.begin(); ssit != ss_adm.end(); ++ssit) {
        c = *ssit;
        if (pfC[c] != 0) {
            if (pTC[c] == 0) { PLERROR("compute_BN_likelihood : division by zero pfC/pTC"); }
            pfC[c] /= pTC[c];
        }
    }

    // like(e, f) = sum_c P(e|c) P(f|c) P(c) over the admissible concepts.
    for (ssit = ss_adm.begin(); ssit != ss_adm.end(); ++ssit) {
        c = *ssit;
        like += peC[c] * pfC[c] * pC[c];
        sum_epEC[c] += peC[c];
        sum_fpFC[c] += pfC[c];
    }

    if (update) {
        if (like != 0) {
            real chk_up_pes = 0;
            real chk_up_pfs = 0;
            real chk_up_pc = 0;
            real chk_up_ps = 0;

            // E-step, concept layer: posterior of c given (e, f).
            for (ssit = ss_adm.begin(); ssit != ss_adm.end(); ++ssit) {
                c = *ssit;
                p = peC[c] * pfC[c] * pC[c] / like;
                if (p != 0)
                    nA[c] += nb * p * pA[c];
                chk_up_pc += nb * p * pA[c];
            }
            if (chk_up_pc - nb > PROB_PREC)
                PLERROR("compute_BN_likelihood : inconsistent update for chk_pc = %f instead of %f", chk_up_pc, nb);

            // E-step, sense layer: fractional counts for nES, nFS and nS,
            // restricted to the admissible ancestors of each sense.
            for (sit = synsets.begin(); sit != synsets.end(); ++sit) {
                s = *sit;
                ss_anc = ontology.getSynsetAncestors(s);
                if (pTC[s] != 0) {
                    ss_anc.insert(s);
                }
                ss_admAnc.clear();
                set_intersection(ss_anc.begin(), ss_anc.end(), ss_adm.begin(), ss_adm.end(),
                                 inserter(ss_admAnc, ss_admAnc.begin()));

                pws = pES.get(e, s);
                if (pws != 0) {
                    for (ssit = ss_admAnc.begin(); ssit != ss_admAnc.end(); ++ssit) {
                        c = *ssit;
                        if (ss_anc.find(c) == ss_anc.end())
                            continue;
                        p = pC[c] * pws * pS[s] / pTC[c] * pfC[c] / like;
                        if (p != 0) {
                            nES.incr(e, s, nb * p);
                            nS[s] += nb * p;
                            chk_up_pes += nb * p;
                            chk_up_ps += nb * p;
                        }
                    }
                }

                pws = pFS.get(f, s);
                if (pws != 0) {
                    for (ssit = ss_admAnc.begin(); ssit != ss_admAnc.end(); ++ssit) {
                        c = *ssit;
                        if (ss_anc.find(c) == ss_anc.end())
                            continue;
                        p = pC[c] * pws * pS[s] / pTC[c] * peC[c] / like;
                        if (p != 0) {
                            nFS.incr(f, s, nb * p);
                            nS[s] += nb * p;
                            chk_up_pfs += nb * p;
                            chk_up_ps += nb * p;
                        }
                    }
                }
            }
            if (chk_up_pfs - nb > PROB_PREC || chk_up_pes - nb > PROB_PREC)
                PLERROR("compute_BN_likelihood : inconsistent update for chk_pES = %f or chk_pFS = %f instead of %f", chk_up_pes, chk_up_pfs, nb);
            if (chk_up_ps - 2 * nb > PROB_PREC)
                PLERROR("compute_BN_likelihood : inconsistent update for chk_ps = %f instead of %f", chk_up_ps, nb);
        }
    }

    // Posterior P(s|e,f) over the senses of e, accumulated into the bitext
    // sense-given-word counts nSEbi.
    sumpost = 0;
    if (like != 0) {
        Set e_senses = ontology.getWordSenses(e);
        for (sit = e_senses.begin(); sit != e_senses.end(); ++sit) {
            post = 0;
            se = *sit;
            ss_anc = ontology.getSynsetAncestors(se);
            if (pTC[se] != 0) {
                ss_anc.insert(se);
            }
            ss_adm = commNode(e, f);
            ss_admAnc.clear();
            set_intersection(ss_anc.begin(), ss_anc.end(), ss_adm.begin(), ss_adm.end(),
                             inserter(ss_admAnc, ss_admAnc.begin()));
            pws = pES.get(e, se);
            if (pws != 0) {
                for (ssit = ss_admAnc.begin(); ssit != ss_admAnc.end(); ++ssit) {
                    c = *ssit;
                    if (ss_anc.find(c) == ss_anc.end())
                        continue;
                    post += pC[c] * pws * pS[se] / pTC[c] * pfC[c] / like;
                }
            }
            if (post != 0) {
                nSEbi.incr(se, e, post);
                sumpost += post;
            }
        }
        if (sumpost - 1.0 > PROB_PREC)
            PLERROR("Bitext Entropy computation : sum posterior %f != 1.0", sumpost);
    }

    return like;
}
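
// Thin wrappers over compute_likelihood(): the training pass runs with
// update = 1 (EM counts are accumulated and the model re-estimated), the
// validation pass with update = 0 (scoring only).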
void GraphicalBiText::compute_train_likelihood(string name)
{
    compute_likelihood(train_bitext_src, train_bitext_tgt, name, 1);
}

void GraphicalBiText::compute_valid_likelihood(string name)
{
    compute_likelihood(valid_bitext_src, valid_bitext_tgt, name, 0);
}
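
// Score the aligned bitext (bitext_src[i], bitext_tgt[i]) under three models:
//  - "indep": P(e) P(f), the independence baseline;
//  - "joint": the empirical joint distribution pEF;
//  - "BN":    the ontology-based Bayes net of compute_BN_likelihood().
// Each model is also smoothed by interpolation with the baseline,
//     log(alpha * p_model + (1 - alpha) * p_indep),
// and reported as a perplexity safeexp(-ll / nb_tokens). When update is set,
// the fractional counts accumulated along the way are normalized into new
// pS, pES, pFS and pA estimates (the EM M-step).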
void GraphicalBiText::compute_likelihood(Vec bitext_src, Vec bitext_tgt, string name, bool update)
{
    real join_event_number = 0;
    real indep_event_number = 0;
    real bn_event_number = 0;
    real bn_like;
    real indep_like;
    real join_like;
    real join_log_likelihood = 0.0;
    real smoothed_join_log_likelihood = 0.0;
    real indep_log_likelihood = 0.0;
    real bn_log_likelihood = 0.0;
    real smoothed_bn_log_likelihood = 0.0;

    real sum_s, sum_es, sum_fs;
    real up_proba;
    int i;
    int e, f, s, c;
    SetIterator sit, ssit;

    int nb_trans_pairs = 0;
    ProbSparseMatrix ef_occur;
    real nb_occu;

    ef_occur.resize(source_wsd_voc_size, target_wsd_voc_size);
    ef_occur.setName("ef_occur");
    ef_occur.setMode(COLUMN_WISE);

    join_log_likelihood = 0.0;
    indep_log_likelihood = 0.0;
    bn_log_likelihood = 0.0;
    join_event_number = 0;
    indep_event_number = 0;
    bn_event_number = 0;

    if (update) {
        nA.clear();
        nS.clear();
        nES.clear();
        nFS.clear();
        nSEbi.clear();
    }

    // Phase 1: count the (e, f) translation pairs whose source word is in
    // the ontology and whose target word is in the WSD vocabulary.
    ShellProgressBar progress(0, bitext_src.size(), "Computing_likelihood_phase1_" + name, 50);
    progress.init();
    progress.draw();

    for (i = 0; i < bitext_src.size(); i++) {
        e = (int)bitext_src[i];
        f = (int)bitext_tgt[i];
        if (ontology.isWord(e) && target_wsd_voc.find(f) != target_wsd_voc.end()) {
            ef_occur.incr(e, f);
            nb_trans_pairs++;
        }
        progress.update(i);
    }
    cout << nb_trans_pairs << " translation_pairs_found" << endl;
    progress.done();

    // Phase 2: accumulate the three log-likelihoods over the distinct pairs,
    // each weighted by its occurrence count.
    progress.set(0, ef_occur.getWidth(), "Computing_likelihood_phase2_" + name, 50);
    progress.init();
    progress.draw();

    for (int f = 0; f < ef_occur.getWidth(); f++) {
        map<int, real>& col_j = ef_occur.getCol(f);
        for (map<int, real>::iterator it = col_j.begin(); it != col_j.end(); ++it) {
            e = (int)it->first;
            nb_occu = it->second;

            indep_like = pE[e] * pF[f];
            indep_log_likelihood += nb_occu * log(indep_like);
            indep_event_number += nb_occu;

            bn_like = compute_BN_likelihood(e, f, update, nb_occu);
            if (bn_like > 1.0 + PROB_PREC) {
                PLERROR("Compute_likelihood : BN proba > 1 for %d (%s) %d (%s) ", e,
                        (source_id_to_word[e]).c_str(), f, (target_id_to_word[f]).c_str());
            }
            if (bn_like != 0) {
                bn_log_likelihood += nb_occu * log(bn_like);
                bn_event_number += nb_occu;
            }
            smoothed_bn_log_likelihood += log(alpha_bn * bn_like + (1 - alpha_bn) * indep_like);

            join_like = pEF.get(e, f);
            if (join_like != 0) {
                join_log_likelihood += nb_occu * log(join_like);
                join_event_number += nb_occu;
            }
            smoothed_join_log_likelihood += log(alpha_joint * join_like + (1 - alpha_joint) * indep_like);
        }
        progress.update(f);
    }
    progress.done();

    cout << name + " indep \t/ ll = " << indep_log_likelihood << " \t/ token = " << indep_event_number << " \t/ smoothed : " << indep_log_likelihood << " \t/ perp = " << safeexp(-indep_log_likelihood / indep_event_number) << " \t/ smoothed : " << safeexp(-indep_log_likelihood / indep_event_number) << endl;
    cout << name + " joint \t/ ll = " << join_log_likelihood << " \t/ token = " << join_event_number << " \t/ smoothed : " << smoothed_join_log_likelihood << " \t/ perp = " << safeexp(-join_log_likelihood / join_event_number) << " \t/ smoothed : " << safeexp(-smoothed_join_log_likelihood / indep_event_number) << endl;
    cout << name + " BN \t/ ll = " << bn_log_likelihood << " \t/ token = " << bn_event_number << " \t/ smoothed : " << smoothed_bn_log_likelihood << " \t/ perp = " << safeexp(-bn_log_likelihood / bn_event_number) << " \t/ smoothed : " << safeexp(-smoothed_bn_log_likelihood / indep_event_number) << endl;

    // M-step: renormalize the accumulated counts into probabilities.
    if (update) {
        progress.set(0, ss_size, "Update_pS_pES_pFS", 50);
        progress.init();
        progress.draw();

        pA.clear();
        pS.clear();
        pES.clear();
        pFS.clear();

        sum_s = sum(nS);

        Set synsets = ontology.getAllCategories();
        for (sit = synsets.begin(); sit != synsets.end(); ++sit) {
            s = *sit;
            if (nS[s] != 0)
                pS[s] = nS[s] / sum_s;
            sum_es = 0;
            Set source_words = ontology.getAllWords();
            for (ssit = source_words.begin(); ssit != source_words.end(); ++ssit) {
                e = *ssit;
                sum_es += nES.get(e, s);
            }
            for (ssit = source_words.begin(); ssit != source_words.end(); ++ssit) {
                e = *ssit;
                up_proba = nES.get(e, s);
                if (up_proba != 0) {
                    pES.set(e, s, up_proba / sum_es);
                }
            }
            sum_fs = 0;
            for (ssit = target_wsd_voc.begin(); ssit != target_wsd_voc.end(); ++ssit) {
                f = *ssit;
                sum_fs += nFS.get(f, s);
            }
            for (ssit = target_wsd_voc.begin(); ssit != target_wsd_voc.end(); ++ssit) {
                f = *ssit;
                up_proba = nFS.get(f, s);
                if (up_proba != 0) {
                    pFS.set(f, s, up_proba / sum_fs);
                }
            }
            progress.update(s);
        }
        compute_pTC();

        synsets = ontology.getAllCategories();
        for (sit = synsets.begin(); sit != synsets.end(); ++sit) {
            c = *sit;
            if (nA[c] != 0) {
                pA[c] = nA[c] / bn_event_number;
            }
        }
        compute_pTC();
        check_set_pA();
        compute_pMC();

        progress.done();
    }
    pSEbi.clear();
    pSEbi.normalizeCond(nSEbi, false);
}
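
// Kullback-Leibler divergence, per source word e, between the sense
// distribution estimated from the bitext (pSEbi) and the one estimated from
// the sense-tagged corpus (pSE):
//     KL[e] = sum_s pSEbi(s|e) * log2( pSEbi(s|e) / pSE(s|e) )
// Terms that evaluate to nan (e.g. 0 * log(0 / p)) are skipped by the isnan
// guard, which effectively counts them as zero.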
void GraphicalBiText::computeKL()
{
    int e;
    SetIterator sit;
    Set e_words = ontology.getAllWords();
    real kl, skl;
    for (sit = e_words.begin(); sit != e_words.end(); ++sit) {
        e = *sit;
        kl = 0;
        if (pSEbi.sumCol(e) == 0 || pSE.sumCol(e) == 0)
            continue;
        map<int, real>& col_e = pSE.getCol(e);
        for (map<int, real>::iterator mit = col_e.begin(); mit != col_e.end(); ++mit) {
            skl = pSEbi.get(mit->first, e) * safeflog2(pSEbi.get(mit->first, e) / mit->second);
            if (!isnan(skl))
                kl += skl;
        }
        KL[e] = kl;
    }
}
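
// Load an optional mapping from fine-grained sense keys to coarser sense
// groups. The parsing below expects whitespace-separated lines: on lines
// with more than one token, the first field is mapped to the third
// (fine key -> coarse key, the second field presumably being an
// intermediate column), while single-token lines map a key to itself.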
void GraphicalBiText::loadSensemap(string sensemap_file)
{
    int nbMap = 0;

    cout << "Loading sensemap : ";
    ifstream sensemap_stream(sensemap_file.c_str());
    string line;
    vector<string> tokens;
    if (sensemap_stream.is_open()) {
        while (!sensemap_stream.eof()) {
            line = pgetline(sensemap_stream);
            if (line == "")
                continue;
            tokens = split(line, " ");
            if (tokens.size() > 1) { // mapping lines are assumed to have at least 3 fields
                nbMap++;
                sensemap[tokens[0]] = tokens[2];
            } else {
                sensemap[tokens[0]] = tokens[0];
            }
        }
    }
    cout << nbMap << " sense mappings found\n";
}
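
// Training driver: evaluate the initial (epoch-0) model on all Semcor sets,
// then alternate EM passes over the bitext (compute_train_likelihood with
// update on), model re-estimation via update_WSD_model(), and WSD evaluation
// at each interpolation coefficient. With interp_min = interp_max = 1, the
// interpolation loops currently run a single iteration at interp = 1.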
void GraphicalBiText::train(VMat training_set)
{
    TVec<string> our_answers1(wsd_train.length());

    real interp_max = 1;
    real interp_min = 1;
    real interp_step = 0.4;

    for (real interp = interp_min; interp <= interp_max; interp += interp_step) {
        test_WSD(wsd_train, "Semcor_train_set_epoch_0_" + tostring(interp), our_answers1, 0, interp);
        test_WSD(wsd_valid, "Semcor_valid1_set_epoch_0_" + tostring(interp), our_answers1, 0, interp);
        test_WSD(wsd_valid2, "Semcor_valid2_set_epoch_0_" + tostring(interp), our_answers1, 0, interp);
        test_WSD(wsd_test, "Semcor_test_set_epoch_0_" + tostring(interp), our_answers1, 0, interp);
    }

    for (int i = 1; i < n_epoch; i++) {
        compute_train_likelihood("Train_set_epoch " + tostring(i));
        compute_valid_likelihood("Valid_set_epoch " + tostring(i));

        update_WSD_model(tostring(i));
        check_consitency();

        for (real interp = interp_min; interp <= interp_max; interp += interp_step) {
            test_WSD(wsd_train, "Semcor_train_set_epoch_" + tostring(i) + "_" + tostring(interp), our_answers1, 0, interp);
            test_WSD(wsd_valid, "Semcor_valid1_set_epoch_" + tostring(i) + "_" + tostring(interp), our_answers1, 0, interp);
            test_WSD(wsd_valid2, "Semcor_valid2_set_epoch_" + tostring(i) + "_" + tostring(interp), our_answers1, 0, interp);
            test_WSD(wsd_test, "Semcor_test_set_epoch_" + tostring(i) + "_" + tostring(interp), our_answers1, 0, interp);
        }
    }
}
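
// test() and setTrainingSet() are left as empty stubs below; evaluation is
// instead driven by the test_WSD() calls inside train() above.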
void GraphicalBiText::test()
{
}

void GraphicalBiText::setTrainingSet(VMat training_set, bool call_forget)
{
}

} // namespace PLearn