00001
00003
#include "TextSenseSequenceVMatrix.h"
00004
00005
namespace PLearn {
00006
using namespace std;
00007
00008
00009 TextSenseSequenceVMatrix::TextSenseSequenceVMatrix()
00010 :
inherited(),window_size(0), is_supervised_data(true), res_pos(
TVec<
int>(0)), rand_syn(false), wno(NULL), keep_in_sentence(false), undefined_pos_set(false)
00011
00012 {
00013
00014
00015
00016 }
00017
00018
// Registers the class with PLearn's object machinery: class name, one-line
// description, and (empty) extended help text.
PLEARN_IMPLEMENT_OBJECT(
    TextSenseSequenceVMatrix,
    "VMat class that takes another VMat which contains a sequence (rows) of "
    "words/sense/POS triplets extracted from a corpus and implements a "
    "representation of a target word and its context.",
    "");
00023
00024 void TextSenseSequenceVMatrix::getNewRow(
int i,
const Vec& v)
const
00025
{
00026
00027
if(
res_pos.
size() != 0)
00028 {
00029
getRestrictedRow(i,v);
00030
return;
00031 }
00032
00033
if(i >=
dvm->
length() || i < 0)
00034
PLERROR(
"In TextSenseSequenceVMatrix: requesting %dth row of matrix of length %d", i,
dvm->
length());
00035
if(v.
length() != 3*(
window_size+1))
00036
PLERROR(
"In TextSenseSequenceVMatrix: getNewRow v.length() must be equal to VMat's width");
00037
00038
00039
00040
if(i ==
my_current_row_index)
00041 {
00042
for(
int j=0; j<
my_current_row.
size(); j++)
00043 v[j] =
my_current_row[j];
00044
if(
dvm->
width() == 3 &&
rand_syn)
00045
permute(v);
00046
for(
int j=0; j<my_current_row.size(); j++)
00047 my_current_row[j] = v[j];
00048
return;
00049 }
00050
00051
00052
00053
int context_dist = -(
window_size/2);
00054
int context_count = 0;
00055
while(context_dist <=
window_size/2)
00056 {
00057
int context_dist_i = context_dist+i;
00058
int dist_my_current_row_index = (context_dist_i) -
my_current_row_index;
00059
if(
my_current_row_index != -1 && dist_my_current_row_index >= -
window_size/2 && dist_my_current_row_index <=
window_size/2)
00060 {
00061
int index = -1;
00062
if(dist_my_current_row_index == 0)
00063 index =
window_size;
00064
if(dist_my_current_row_index < 0)
00065 index = window_size/2 + dist_my_current_row_index;
00066
if(dist_my_current_row_index > 0)
00067 index = window_size/2 + dist_my_current_row_index - 1;
00068
00069
if(context_dist != 0)
00070 {
00071 v[3*context_count] =
my_current_row[3*index];
00072 v[3*context_count+1] = my_current_row[3*index+1];
00073 v[3*context_count+2] = my_current_row[3*index+2];
00074 context_count++;
00075 }
00076
else
00077 {
00078 v[3*window_size] =
my_current_row[3*index];
00079 v[3*window_size+1] = my_current_row[3*index+1];
00080 v[3*window_size+2] = my_current_row[3*index+2];
00081 }
00082 }
00083
else
00084
if(context_dist_i < 0 || context_dist_i >=
dvm->
length())
00085 {
00086 v[3*context_count] = 0;
00087 v[3*context_count+1] =
UNDEFINED_SS_ID;
00088 v[3*context_count+2] =
undefined_pos_set ?
undefined_pos :
UNDEFINED_TYPE;
00089 context_count++;
00090 }
00091
else
00092 {
00093
if(context_dist == 0)
00094 {
00095
if(
is_supervised_data)
00096 {
00097
Vec temp(3);
00098
dvm->getRow(i, temp);
00099
if(temp[0] ==
SYNSETTAG_ID)
00100 {
00101 temp[0] = 0;
00102 temp[1] =
UNDEFINED_SS_ID;
00103 temp[2] =
undefined_pos_set ?
undefined_pos :
UNDEFINED_TYPE;
00104 }
00105 v[3*
window_size] = temp[0];
00106 v[3*
window_size+1] = temp[1];
00107 v[3*
window_size+2] = temp[2];
00108 }
00109
else
00110 {
00111
Vec temp(2);
00112
dvm->getRow(i, temp);
00113
if(
SYNSETTAG_ID == temp[0])
00114 {
00115 temp[0] = 0;
00116 temp[1] =
undefined_pos_set ?
undefined_pos :
UNDEFINED_TYPE;
00117 }
00118 v[3*
window_size] = temp[0];
00119 v[3*
window_size+1] =
UNDEFINED_SS_ID;
00120 v[3*
window_size+2] = temp[1];
00121 }
00122 context_dist++;
00123
continue;
00124 }
00125
00126
if(
is_supervised_data)
00127 {
00128
Vec temp(3);
00129
dvm->getRow(context_dist_i, temp);
00130
if(temp[0] ==
SYNSETTAG_ID)
00131 {
00132 temp[0] = 0;
00133 temp[1] =
UNDEFINED_SS_ID;
00134 temp[2] =
undefined_pos_set ?
undefined_pos :
UNDEFINED_TYPE;
00135 }
00136 v[3*context_count] = temp[0];
00137 v[3*context_count+1] = temp[1];
00138 v[3*context_count+2] = temp[2];
00139 }
00140
else
00141 {
00142
Vec temp(2);
00143
dvm->getRow(context_dist_i, temp);
00144
if(
SYNSETTAG_ID == temp[0])
00145 {
00146 temp[0] = 0;
00147 temp[1] =
undefined_pos_set ?
undefined_pos :
UNDEFINED_TYPE;
00148 }
00149 v[3*context_count] = temp[0];
00150 v[3*context_count+1] =
UNDEFINED_SS_ID;
00151 v[3*context_count+2] = temp[1];
00152 }
00153 context_count++;
00154 }
00155 context_dist++;
00156 }
00157
00158
if(context_count !=
window_size)
00159
PLERROR(
"What the hell!!!");
00160
00161
if(
dvm->
width() == 3 &&
rand_syn)
00162
permute(v);
00163
00164
my_current_row_index = i;
00165
for(
int j=0; j<
my_current_row.
size(); j++)
00166
my_current_row[j] = v[j];
00167
00168
if(
keep_in_sentence)
apply_boundary(v);
00169 }
00170
00171 int TextSenseSequenceVMatrix::getRestrictedRow(
const int i,
Vec v)
const
00172
{
00173
00174
if(i >=
dvm->
length() || i < 0)
00175
PLERROR(
"In TextSenseSequenceVMatrix: requesting %dth row of matrix of length %d", i,
dvm.
length());
00176
if(v.
length() != 3*(
window_size+1))
00177
PLERROR(
"In TextSenseSequenceVMatrix: getRestrictedRow v.length() must be equal to VMat's width");
00178
00179
00180
00181
for(
int j=0; j<
window_size; j++)
00182 {
00183 v[3*j] = 0;
00184 v[3*j+1] =
UNDEFINED_SS_ID;
00185 v[3*j+2] =
undefined_pos_set ?
undefined_pos :
UNDEFINED_TYPE;
00186 }
00187
00188
00189
00190
if(
is_supervised_data)
00191 {
00192
Vec temp(3);
00193
dvm->getRow(i, temp);
00194
if(
SYNSETTAG_ID == temp[0])
00195 {
00196 temp[0] = 0;
00197 temp[1] =
UNDEFINED_SS_ID;
00198 temp[2] =
undefined_pos_set ?
undefined_pos :
UNDEFINED_TYPE;
00199 }
00200 v[3*window_size] = temp[0];
00201 v[3*window_size+1] = temp[1];
00202 v[3*window_size+2] = temp[2];
00203 }
00204
else
00205 {
00206
Vec temp(2);
00207
dvm->getRow(i, temp);
00208
if(
SYNSETTAG_ID == temp[0])
00209 {
00210 temp[0] = 0;
00211 temp[1] =
undefined_pos_set ?
undefined_pos :
UNDEFINED_TYPE;
00212 }
00213 v[3*window_size] = temp[0];
00214 v[3*window_size+1] =
UNDEFINED_SS_ID;
00215 v[3*window_size+2] = temp[1];
00216 }
00217
00218
00219
00220
00221
int context_dist = -1;
00222
int context_found = 0;
00223
00224
while(context_found != window_size/2 && context_dist+i >=0)
00225 {
00226
if(
is_supervised_data)
00227 {
00228
Vec temp(3);
00229
dvm->getRow(context_dist+i, temp);
00230
if(temp[0] ==
SYNSETTAG_ID)
00231
break;
00232
if(!
res_pos.
contains((
int)temp[2]))
00233 {
00234 context_found++;
00235
int index = window_size/2 - context_found;
00236 v[3*index] = temp[0];
00237 v[3*index+1] = temp[1];
00238 v[3*index+2] = temp[2];
00239 }
00240 }
00241
else
00242 {
00243
Vec temp(2);
00244
dvm->getRow(context_dist+i, temp);
00245
if(temp[0] ==
SYNSETTAG_ID)
00246
break;
00247
if(!
res_pos.
contains((
int)temp[1]))
00248 {
00249 context_found++;
00250
int index = window_size/2 - context_found;
00251 v[3*index] = temp[0];
00252 v[3*index+1] =
UNDEFINED_SS_ID;;
00253 v[3*index+2] = temp[1];
00254 }
00255 }
00256 context_dist--;
00257 }
00258
00259
00260
00261 context_dist = 1;
00262 context_found = window_size/2;
00263
00264
while(context_found != window_size && context_dist+i <
dvm->
length())
00265 {
00266
if(
is_supervised_data)
00267 {
00268
Vec temp(3);
00269
dvm->getRow(context_dist+i, temp);
00270
if(temp[0] ==
SYNSETTAG_ID)
00271
break;
00272
if(!
res_pos.
contains((
int)temp[2]))
00273 {
00274
int index = context_found;
00275 context_found++;
00276 v[3*index] = temp[0];
00277 v[3*index+1] = temp[1];
00278 v[3*index+2] = temp[2];
00279 }
00280 }
00281
else
00282 {
00283
Vec temp(2);
00284
dvm->getRow(context_dist+i, temp);
00285
if(temp[0] ==
SYNSETTAG_ID)
00286
break;
00287
if(!
res_pos.
contains((
int)temp[1]))
00288 {
00289
int index = context_found;
00290 context_found++;
00291 v[3*index] = temp[0];
00292 v[3*index+1] =
UNDEFINED_SS_ID;;
00293 v[3*index+2] = temp[1];
00294 }
00295 }
00296 context_dist++;
00297 }
00298
00299
00300
00301 context_found = 0;
00302
while(context_found != window_size/2+1 && context_dist+i <
dvm->
length())
00303 {
00304
if(
is_supervised_data)
00305 {
00306
Vec temp(3);
00307
dvm->getRow(context_dist+i, temp);
00308
if(temp[0] ==
SYNSETTAG_ID)
00309 {
00310 context_dist++;
00311
continue;
00312 }
00313
if(!
res_pos.
contains((
int)temp[2]))
00314 context_found++;
00315 }
00316
else
00317 {
00318
Vec temp(2);
00319
dvm->getRow(context_dist+i, temp);
00320
if(temp[0] ==
SYNSETTAG_ID)
00321 {
00322 context_dist++;
00323
continue;
00324 }
00325
if(!
res_pos.
contains((
int)temp[1]))
00326 context_found++;
00327 }
00328 context_dist++;
00329 }
00330
00331
00332
if(
dvm->
width() == 3 &&
rand_syn)
00333
permute(v);
00334
00335
my_current_row_index = i;
00336
for(
int j=0; j<
my_current_row.
size(); j++)
00337
my_current_row[j] = v[j];
00338
00339
if(
keep_in_sentence)
apply_boundary(v);
00340
00341
return context_dist+i ==
dvm->
length() ? context_dist+i : context_dist+i-1;
00342 }
00343
00344 void TextSenseSequenceVMatrix::apply_boundary(
const Vec& v)
const
00345
{
00346
00347
00348
bool found_boundary =
false;
00349
for(
int i=
window_size/2-1; i>=0; i--)
00350 {
00351
if(v[3*i] ==
sentence_boundary) found_boundary =
true;
00352
if(found_boundary)
00353 {
00354 v[3*i] = 0;
00355 v[3*i+1] =
UNDEFINED_SS_ID;
00356
if(
undefined_pos_set) v[3*i+2] =
undefined_pos;
00357
else v[3*i+2] =
UNDEFINED_TYPE;
00358 }
00359 }
00360
00361
00362
00363 found_boundary =
false;
00364
for(
int i=
window_size/2; i<
window_size; i--)
00365 {
00366
if(v[3*i] ==
sentence_boundary) found_boundary =
true;
00367
if(found_boundary)
00368 {
00369 v[3*i] = 0;
00370 v[3*i+1] =
UNDEFINED_SS_ID;
00371
if(
undefined_pos_set) v[3*i+2] =
undefined_pos;
00372
else v[3*i+2] =
UNDEFINED_TYPE;
00373 }
00374 }
00375 }
00376
00377 void TextSenseSequenceVMatrix::permute(
Vec v)
const
00378
{
00379
for(
int i=0; i<
window_size+1; i++)
00380 {
00381
int pos = (
int)v[3*i+2];
00382
if(pos ==
NOUN_TYPE || pos ==
VERB_TYPE || pos ==
ADJ_TYPE || pos ==
ADV_TYPE)
00383 {
00384
real rand =
uniform_sample();
00385
real sum = 0;
00386
int j=0;
00387
int sense = (
int)v[3*i+1];
00388
int word_id = (
int)v[3*i];
00389
if(sense >= 0 && word_id >= 0)
00390 {
00391
for(; j<
word_given_sense_priors[sense].
size(); j++)
00392 {
00393
if(rand <
sum + word_given_sense_priors[sense][j].second)
00394
break;
00395
sum += word_given_sense_priors[sense][j].second;
00396 }
00397
string word =
wno->
getWord(word_given_sense_priors[sense][j].first);
00398
string stemmed_syn =
stemWord(word, pos);
00399
int syn_word_id =
wno->
getWordId(stemmed_syn);
00400
if(syn_word_id != -1)
00401 {
00402
TVec<int> senses_of_target_word;
00403
switch (pos)
00404 {
00405
case NOUN_TYPE:
00406 senses_of_target_word =
wno->
temp_word_to_noun_senses[syn_word_id];
00407
break;
00408
case VERB_TYPE:
00409 senses_of_target_word =
wno->
temp_word_to_verb_senses[syn_word_id];
00410
break;
00411
case ADJ_TYPE:
00412 senses_of_target_word =
wno->
temp_word_to_adj_senses[syn_word_id];
00413
break;
00414
case ADV_TYPE:
00415 senses_of_target_word =
wno->
temp_word_to_adv_senses[syn_word_id];
00416
break;
00417
case UNDEFINED_TYPE:
00418 senses_of_target_word =
wno->
getSensesForWord(syn_word_id);
00419
break;
00420
default:
00421
00422 senses_of_target_word =
wno->
getSensesForWord(syn_word_id);
00423 }
00424
00425
int k=0;
00426
while(
k<senses_of_target_word.
size())
00427 {
00428
if(senses_of_target_word[
k] == (
int)v[3*i+1])
00429
break;
00430
k++;
00431 }
00432
if(
k != senses_of_target_word.
size())
00433 v[3*i] = syn_word_id;
00434 }
00435 }
00436 }
00437 }
00438 }
00439
00440 void TextSenseSequenceVMatrix::declareOptions(
OptionList& ol)
00441 {
00442
declareOption(ol,
"window_size", &TextSenseSequenceVMatrix::window_size, OptionBase::buildoption,
"Size of the context window");
00443
declareOption(ol,
"is_supervised_data", &TextSenseSequenceVMatrix::is_supervised_data, OptionBase::buildoption,
"Data of VMatrix is supervised");
00444
declareOption(ol,
"res_pos", &TextSenseSequenceVMatrix::res_pos, OptionBase::buildoption,
"TVec<int> containing the POSs of the words which should not be included in the target word context");
00445
declareOption(ol,
"dvm", &TextSenseSequenceVMatrix::dvm, OptionBase::buildoption,
"VMatrix that contains the triplets word/sense/POS of a corpus");
00446
declareOption(ol,
"rand_syn", &TextSenseSequenceVMatrix::rand_syn, OptionBase::buildoption,
"Use same-sense random permutation of words");
00447
declareOption(ol,
"keep_in_sentence", &TextSenseSequenceVMatrix::keep_in_sentence, OptionBase::buildoption,
"Indication that the context must not spread over another sentence");
00448
declareOption(ol,
"sentence_boundary", &TextSenseSequenceVMatrix::sentence_boundary, OptionBase::buildoption,
"Sentence boundary symbol");
00449
declareOption(ol,
"undefined_pos_set", &TextSenseSequenceVMatrix::undefined_pos_set, OptionBase::buildoption,
"Indication that the undefined pos id is defined");
00450
declareOption(ol,
"undefined_pos", &TextSenseSequenceVMatrix::undefined_pos, OptionBase::buildoption,
"Undefined pos id");
00451 inherited::declareOptions(ol);
00452 }
00453
00454 void TextSenseSequenceVMatrix::build_()
00455 {
00456
if(
window_size%2 != 0)
00457
PLERROR(
"In TextSenseSequenceVMatrix: window_size must be even number");
00458
if(
window_size < 0)
00459
PLERROR(
"In TextSenseSequenceVMatrix: window_size must be non negative");
00460
if(
dvm->
width() != 2 &&
dvm->
width() != 3)
00461
PLERROR(
"In TextSenseSequenceVMatrix: VMat that_dvm should have width equal to 2 or 3");
00462
00463 width_ = 3*(
window_size+1);
00464 length_ =
dvm->
length();
00465 fieldinfos.
resize(width_);
00466
00467
00468
00469
00470
if(
dvm->
width() == 2 &&
rand_syn)
00471
PLWARNING(
"In TextSenseSequenceVMatrix: cannot use permutation of same-sense words with unsupervised data");
00472
00473
if(
dvm.
isNull())
00474
PLERROR(
"In TextSenseSequenceVMatrix: dvm (data of the matrix) is not defined");
00475
if(
dvm->
width() == 3 &&
rand_syn)
00476 {
00477
if(
wno == NULL)
00478
PLERROR(
"In TextSenseSequence: there is no WordNetOntology defined");
00479
word_given_sense_priors.
resize(
wno->
getSenseSize());
00480
00481
for(
int i=0; i<
word_given_sense_priors.
size(); i++)
00482 {
00483
Set words_for_sense =
wno->
getWordsForSense(i);
00484
int n_words_for_sense = words_for_sense.
size();
00485
word_given_sense_priors[i]->
resize(n_words_for_sense, 1);
00486
int j=0;
00487
for(
SetIterator sit = words_for_sense.
begin(); sit != words_for_sense.
end(); sit++,j++)
00488 {
00489
word_given_sense_priors[i][j].
first = *sit;
00490
word_given_sense_priors[i][j].second = 1;
00491 }
00492 }
00493
00494
if(
dvm.
isNull())
00495
PLERROR(
"In TextSenseSequenceVMatrix: dvm (data of the matrix) is not defined");
00496
Vec triplet(3);
00497
for(
int i=0; i<
dvm.
length(); i++)
00498 {
00499
dvm->getRow(i, triplet);
00500
int sense = (
int)triplet[1];
00501
int word = (
int )triplet[0];
00502
if(sense >= 0 && word >= 0)
00503 {
00504
int size =
word_given_sense_priors[sense].
size();
00505
for(
int j=0; j<size;j++)
00506
if(word == word_given_sense_priors[sense][j].first)
00507 {
00508 word_given_sense_priors[sense][j].second += word_given_sense_priors[sense][j].second == 1 ? 1 : 2;
00509
break;
00510 }
00511 }
00512 }
00513
00514
00515
for(
int i=0; i<
word_given_sense_priors.
size(); i++)
00516 {
00517
real sum = 0;
00518
for(
int j=0; j<
word_given_sense_priors[i].
size(); j++)
00519
sum += word_given_sense_priors[i][j].second;
00520
if(
sum != 0)
00521
for(
int j=0; j<word_given_sense_priors[i].size(); j++)
00522 word_given_sense_priors[i][j].second /=
sum;
00523 }
00524 }
00525 }
00526
00527 void TextSenseSequenceVMatrix::build()
00528 {
00529 inherited::build();
00530
build_();
00531 }
00532
00533 void TextSenseSequenceVMatrix::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies)
00534 {
00535 inherited::makeDeepCopyFromShallowCopy(copies);
00536
deepCopyField(
dvm, copies);
00537
deepCopyField(
res_pos,copies);
00538
00539
00540 }
00541
00542 }
00543