PLearn: TextSenseSequenceVMatrix.cc Source File

00001 00003 #include "TextSenseSequenceVMatrix.h" 00004 00005 namespace PLearn { 00006 using namespace std; 00007 00008 00009 TextSenseSequenceVMatrix::TextSenseSequenceVMatrix() 00010 :inherited(),window_size(0), is_supervised_data(true), res_pos(TVec<int>(0)), rand_syn(false), wno(NULL), keep_in_sentence(false), undefined_pos_set(false) 00011 /* ### Initialise all fields to their default value */ 00012 { 00013 00014 // ### You may or may not want to call build_() to finish building the object 00015 //build_(); 00016 } 00017 00018 PLEARN_IMPLEMENT_OBJECT(TextSenseSequenceVMatrix, 00019 "VMat class that takes another VMat which contains a sequence (rows) " 00020 "of words/sense/POS triplets extracted from a corpus and implements a " 00021 "representation of a target word and its context.", 00022 ""); 00023 00024 void TextSenseSequenceVMatrix::getNewRow(int i, const Vec& v) const 00025 { 00026 00027 if(res_pos.size() != 0) 00028 { 00029 getRestrictedRow(i,v); 00030 return; 00031 } 00032 00033 if(i >= dvm->length() || i < 0) 00034 PLERROR("In TextSenseSequenceVMatrix: requesting %dth row of matrix of length %d", i, dvm->length()); 00035 if(v.length() != 3*(window_size+1)) 00036 PLERROR("In TextSenseSequenceVMatrix: getNewRow v.length() must be equal to VMat's width"); 00037 00038 // Fetch context already in memory 00039 00040 if(i == my_current_row_index) 00041 { 00042 for(int j=0; j<my_current_row.size(); j++) 00043 v[j] = my_current_row[j]; 00044 if(dvm->width() == 3 && rand_syn) 00045 permute(v); 00046 for(int j=0; j<my_current_row.size(); j++) 00047 my_current_row[j] = v[j]; 00048 return; 00049 } 00050 00051 // Fetch context not found in memory 00052 00053 int context_dist = -(window_size/2); 00054 int context_count = 0; 00055 while(context_dist <= window_size/2) 00056 { 00057 int context_dist_i = context_dist+i; 00058 int dist_my_current_row_index = (context_dist_i) - my_current_row_index; 00059 if(my_current_row_index != -1 && dist_my_current_row_index >= - window_size/2 && dist_my_current_row_index <= window_size/2) 00060 { 00061 int index = -1; 00062 if(dist_my_current_row_index == 0) 00063 index = window_size; 00064 if(dist_my_current_row_index < 0) 00065 index = window_size/2 + dist_my_current_row_index; 00066 if(dist_my_current_row_index > 0) 00067 index = window_size/2 + dist_my_current_row_index - 1; 00068 00069 if(context_dist != 0) 00070 { 00071 v[3*context_count] = my_current_row[3*index]; 00072 v[3*context_count+1] = my_current_row[3*index+1]; 00073 v[3*context_count+2] = my_current_row[3*index+2]; 00074 context_count++; 00075 } 00076 else 00077 { 00078 v[3*window_size] = my_current_row[3*index]; 00079 v[3*window_size+1] = my_current_row[3*index+1]; 00080 v[3*window_size+2] = my_current_row[3*index+2]; 00081 } 00082 } 00083 else 00084 if(context_dist_i < 0 || context_dist_i >= dvm->length()) 00085 { 00086 v[3*context_count] = 0; //oov_tag_id : should'nt be handcoded; 00087 v[3*context_count+1] = UNDEFINED_SS_ID; 00088 v[3*context_count+2] = undefined_pos_set ? undefined_pos : UNDEFINED_TYPE; 00089 context_count++; 00090 } 00091 else 00092 { 00093 if(context_dist == 0) 00094 { 00095 if(is_supervised_data) 00096 { 00097 Vec temp(3); 00098 dvm->getRow(i, temp); 00099 if(temp[0] == SYNSETTAG_ID) 00100 { 00101 temp[0] = 0; //oov_tag_id : should'nt be handcoded; 00102 temp[1] = UNDEFINED_SS_ID; 00103 temp[2] = undefined_pos_set ? undefined_pos : UNDEFINED_TYPE; 00104 } 00105 v[3*window_size] = temp[0]; 00106 v[3*window_size+1] = temp[1]; 00107 v[3*window_size+2] = temp[2]; 00108 } 00109 else 00110 { 00111 Vec temp(2); 00112 dvm->getRow(i, temp); 00113 if(SYNSETTAG_ID == temp[0]) 00114 { 00115 temp[0] = 0; //oov_tag_id : should'nt be handcoded; 00116 temp[1] = undefined_pos_set ? undefined_pos : UNDEFINED_TYPE; 00117 } 00118 v[3*window_size] = temp[0]; 00119 v[3*window_size+1] = UNDEFINED_SS_ID; 00120 v[3*window_size+2] = temp[1]; 00121 } 00122 context_dist++; 00123 continue; 00124 } 00125 00126 if(is_supervised_data) 00127 { 00128 Vec temp(3); 00129 dvm->getRow(context_dist_i, temp); 00130 if(temp[0] == SYNSETTAG_ID) 00131 { 00132 temp[0] = 0; //oov_tag_id : should'nt be handcoded; 00133 temp[1] = UNDEFINED_SS_ID; 00134 temp[2] = undefined_pos_set ? undefined_pos : UNDEFINED_TYPE; 00135 } 00136 v[3*context_count] = temp[0]; 00137 v[3*context_count+1] = temp[1]; 00138 v[3*context_count+2] = temp[2]; 00139 } 00140 else 00141 { 00142 Vec temp(2); 00143 dvm->getRow(context_dist_i, temp); 00144 if(SYNSETTAG_ID == temp[0]) 00145 { 00146 temp[0] = 0; //oov_tag_id : should'nt be handcoded; 00147 temp[1] = undefined_pos_set ? undefined_pos : UNDEFINED_TYPE; 00148 } 00149 v[3*context_count] = temp[0]; 00150 v[3*context_count+1] = UNDEFINED_SS_ID; 00151 v[3*context_count+2] = temp[1]; 00152 } 00153 context_count++; 00154 } 00155 context_dist++; 00156 } 00157 00158 if(context_count != window_size) 00159 PLERROR("What the hell!!!"); 00160 00161 if(dvm->width() == 3 && rand_syn) 00162 permute(v); 00163 00164 my_current_row_index = i; 00165 for(int j=0; j<my_current_row.size(); j++) 00166 my_current_row[j] = v[j]; 00167 00168 if(keep_in_sentence) apply_boundary(v); 00169 } 00170 00171 int TextSenseSequenceVMatrix::getRestrictedRow(const int i, Vec v) const 00172 { 00173 00174 if(i >= dvm->length() || i < 0) 00175 PLERROR("In TextSenseSequenceVMatrix: requesting %dth row of matrix of length %d", i, dvm.length()); 00176 if(v.length() != 3*(window_size+1)) 00177 PLERROR("In TextSenseSequenceVMatrix: getRestrictedRow v.length() must be equal to VMat's width"); 00178 00179 // Initialization of context 00180 00181 for(int j=0; j<window_size; j++) 00182 { 00183 v[3*j] = 0; //oov_tag_id : should'nt be handcoded; 00184 v[3*j+1] = UNDEFINED_SS_ID; 00185 v[3*j+2] = undefined_pos_set ? undefined_pos : UNDEFINED_TYPE; 00186 } 00187 00188 // Fetch target word 00189 00190 if(is_supervised_data) 00191 { 00192 Vec temp(3); 00193 dvm->getRow(i, temp); 00194 if(SYNSETTAG_ID == temp[0]) 00195 { 00196 temp[0] = 0; //oov_tag_id : should'nt be handcoded; 00197 temp[1] = UNDEFINED_SS_ID; 00198 temp[2] = undefined_pos_set ? undefined_pos : UNDEFINED_TYPE; 00199 } 00200 v[3*window_size] = temp[0]; 00201 v[3*window_size+1] = temp[1]; 00202 v[3*window_size+2] = temp[2]; 00203 } 00204 else 00205 { 00206 Vec temp(2); 00207 dvm->getRow(i, temp); 00208 if(SYNSETTAG_ID == temp[0]) 00209 { 00210 temp[0] = 0; //oov_tag_id : should'nt be handcoded; 00211 temp[1] = undefined_pos_set ? undefined_pos : UNDEFINED_TYPE; 00212 } 00213 v[3*window_size] = temp[0]; 00214 v[3*window_size+1] = UNDEFINED_SS_ID; 00215 v[3*window_size+2] = temp[1]; 00216 } 00217 00218 00219 // Fetch words to the left 00220 00221 int context_dist = -1; 00222 int context_found = 0; 00223 00224 while(context_found != window_size/2 && context_dist+i >=0) 00225 { 00226 if(is_supervised_data) 00227 { 00228 Vec temp(3); 00229 dvm->getRow(context_dist+i, temp); 00230 if(temp[0] == SYNSETTAG_ID) 00231 break; 00232 if(!res_pos.contains((int)temp[2])) 00233 { 00234 context_found++; 00235 int index = window_size/2 - context_found; 00236 v[3*index] = temp[0]; 00237 v[3*index+1] = temp[1]; 00238 v[3*index+2] = temp[2]; 00239 } 00240 } 00241 else 00242 { 00243 Vec temp(2); 00244 dvm->getRow(context_dist+i, temp); 00245 if(temp[0] == SYNSETTAG_ID) 00246 break; 00247 if(!res_pos.contains((int)temp[1])) 00248 { 00249 context_found++; 00250 int index = window_size/2 - context_found; 00251 v[3*index] = temp[0]; 00252 v[3*index+1] = UNDEFINED_SS_ID;; 00253 v[3*index+2] = temp[1]; 00254 } 00255 } 00256 context_dist--; 00257 } 00258 00259 // Fetch words to the right 00260 00261 context_dist = 1; 00262 context_found = window_size/2; 00263 00264 while(context_found != window_size && context_dist+i < dvm->length()) 00265 { 00266 if(is_supervised_data) 00267 { 00268 Vec temp(3); 00269 dvm->getRow(context_dist+i, temp); 00270 if(temp[0] == SYNSETTAG_ID) 00271 break; 00272 if(!res_pos.contains((int)temp[2])) 00273 { 00274 int index = context_found; 00275 context_found++; 00276 v[3*index] = temp[0]; 00277 v[3*index+1] = temp[1]; 00278 v[3*index+2] = temp[2]; 00279 } 00280 } 00281 else 00282 { 00283 Vec temp(2); 00284 dvm->getRow(context_dist+i, temp); 00285 if(temp[0] == SYNSETTAG_ID) 00286 break; 00287 if(!res_pos.contains((int)temp[1])) 00288 { 00289 int index = context_found; 00290 context_found++; 00291 v[3*index] = temp[0]; 00292 v[3*index+1] = UNDEFINED_SS_ID;; 00293 v[3*index+2] = temp[1]; 00294 } 00295 } 00296 context_dist++; 00297 } 00298 00299 // Looking for next non-overlapping context 00300 00301 context_found = 0; 00302 while(context_found != window_size/2+1 && context_dist+i < dvm->length()) 00303 { 00304 if(is_supervised_data) 00305 { 00306 Vec temp(3); 00307 dvm->getRow(context_dist+i, temp); 00308 if(temp[0] == SYNSETTAG_ID) 00309 { 00310 context_dist++; 00311 continue; 00312 } 00313 if(!res_pos.contains((int)temp[2])) 00314 context_found++; 00315 } 00316 else 00317 { 00318 Vec temp(2); 00319 dvm->getRow(context_dist+i, temp); 00320 if(temp[0] == SYNSETTAG_ID) 00321 { 00322 context_dist++; 00323 continue; 00324 } 00325 if(!res_pos.contains((int)temp[1])) 00326 context_found++; 00327 } 00328 context_dist++; 00329 } 00330 00331 00332 if(dvm->width() == 3 && rand_syn) 00333 permute(v); 00334 00335 my_current_row_index = i; 00336 for(int j=0; j<my_current_row.size(); j++) 00337 my_current_row[j] = v[j]; 00338 00339 if(keep_in_sentence) apply_boundary(v); 00340 00341 return context_dist+i == dvm->length() ? context_dist+i : context_dist+i-1; 00342 } 00343 00344 void TextSenseSequenceVMatrix::apply_boundary(const Vec& v) const 00345 { 00346 // Looking for left boundary 00347 00348 bool found_boundary = false; 00349 for(int i=window_size/2-1; i>=0; i--) 00350 { 00351 if(v[3*i] == sentence_boundary) found_boundary = true; 00352 if(found_boundary) 00353 { 00354 v[3*i] = 0; 00355 v[3*i+1] = UNDEFINED_SS_ID; 00356 if(undefined_pos_set) v[3*i+2] = undefined_pos; 00357 else v[3*i+2] = UNDEFINED_TYPE; 00358 } 00359 } 00360 00361 // Looking for right boundary 00362 00363 found_boundary = false; 00364 for(int i=window_size/2; i<window_size; i--) 00365 { 00366 if(v[3*i] == sentence_boundary) found_boundary = true; 00367 if(found_boundary) 00368 { 00369 v[3*i] = 0; 00370 v[3*i+1] = UNDEFINED_SS_ID; 00371 if(undefined_pos_set) v[3*i+2] = undefined_pos; 00372 else v[3*i+2] = UNDEFINED_TYPE; 00373 } 00374 } 00375 } 00376 00377 void TextSenseSequenceVMatrix::permute(Vec v) const 00378 { 00379 for(int i=0; i<window_size+1; i++) 00380 { 00381 int pos = (int)v[3*i+2]; 00382 if(pos == NOUN_TYPE || pos == VERB_TYPE || pos == ADJ_TYPE || pos == ADV_TYPE) 00383 { 00384 real rand = uniform_sample(); 00385 real sum = 0; 00386 int j=0; 00387 int sense = (int)v[3*i+1]; 00388 int word_id = (int)v[3*i]; 00389 if(sense >= 0 && word_id >= 0) 00390 { 00391 for(; j<word_given_sense_priors[sense].size(); j++) 00392 { 00393 if(rand < sum + word_given_sense_priors[sense][j].second) 00394 break; 00395 sum += word_given_sense_priors[sense][j].second; 00396 } 00397 string word = wno->getWord(word_given_sense_priors[sense][j].first); 00398 string stemmed_syn = stemWord(word, pos); 00399 int syn_word_id = wno->getWordId(stemmed_syn); 00400 if(syn_word_id != -1) 00401 { 00402 TVec<int> senses_of_target_word; 00403 switch (pos) 00404 { 00405 case NOUN_TYPE: 00406 senses_of_target_word = wno->temp_word_to_noun_senses[syn_word_id]; 00407 break; 00408 case VERB_TYPE: 00409 senses_of_target_word = wno->temp_word_to_verb_senses[syn_word_id]; 00410 break; 00411 case ADJ_TYPE: 00412 senses_of_target_word = wno->temp_word_to_adj_senses[syn_word_id]; 00413 break; 00414 case ADV_TYPE: 00415 senses_of_target_word = wno->temp_word_to_adv_senses[syn_word_id]; 00416 break; 00417 case UNDEFINED_TYPE: 00418 senses_of_target_word = wno->getSensesForWord(syn_word_id); 00419 break; 00420 default: 00421 //PLERROR("weird in train, target_pos = %d", target_pos); 00422 senses_of_target_word = wno->getSensesForWord(syn_word_id); 00423 } 00424 00425 int k=0; 00426 while(k<senses_of_target_word.size()) 00427 { 00428 if(senses_of_target_word[k] == (int)v[3*i+1]) 00429 break; 00430 k++; 00431 } 00432 if(k != senses_of_target_word.size()) 00433 v[3*i] = syn_word_id; 00434 } 00435 } 00436 } 00437 } 00438 } 00439 00440 void TextSenseSequenceVMatrix::declareOptions(OptionList& ol) 00441 { 00442 declareOption(ol, "window_size", &TextSenseSequenceVMatrix::window_size, OptionBase::buildoption,"Size of the context window"); 00443 declareOption(ol, "is_supervised_data", &TextSenseSequenceVMatrix::is_supervised_data, OptionBase::buildoption,"Data of VMatrix is supervised"); 00444 declareOption(ol, "res_pos", &TextSenseSequenceVMatrix::res_pos, OptionBase::buildoption,"TVec<int> containing the POSs of the words which should not be included in the target word context"); 00445 declareOption(ol, "dvm", &TextSenseSequenceVMatrix::dvm, OptionBase::buildoption,"VMatrix that contains the triplets word/sense/POS of a corpus"); 00446 declareOption(ol, "rand_syn", &TextSenseSequenceVMatrix::rand_syn, OptionBase::buildoption,"Use same-sense random permutation of words"); 00447 declareOption(ol, "keep_in_sentence", &TextSenseSequenceVMatrix::keep_in_sentence, OptionBase::buildoption,"Indication that the context must not spread over another sentence"); 00448 declareOption(ol, "sentence_boundary", &TextSenseSequenceVMatrix::sentence_boundary, OptionBase::buildoption,"Sentence boundary symbol"); 00449 declareOption(ol, "undefined_pos_set", &TextSenseSequenceVMatrix::undefined_pos_set, OptionBase::buildoption,"Indication that the undefined pos id is defined"); 00450 declareOption(ol, "undefined_pos", &TextSenseSequenceVMatrix::undefined_pos, OptionBase::buildoption,"Undefined pos id"); 00451 inherited::declareOptions(ol); 00452 } 00453 00454 void TextSenseSequenceVMatrix::build_() 00455 { 00456 if(window_size%2 != 0) 00457 PLERROR("In TextSenseSequenceVMatrix: window_size must be even number"); 00458 if(window_size < 0) 00459 PLERROR("In TextSenseSequenceVMatrix: window_size must be non negative"); 00460 if(dvm->width() != 2 && dvm->width() != 3) 00461 PLERROR("In TextSenseSequenceVMatrix: VMat that_dvm should have width equal to 2 or 3"); 00462 00463 width_ = 3*(window_size+1); 00464 length_ = dvm->length(); 00465 fieldinfos.resize(width_); 00466 //To do: Field Infos ? 00467 00468 //oov_tag_id = wno->getWordId(OOV_TAG); 00469 00470 if(dvm->width() == 2 && rand_syn) 00471 PLWARNING("In TextSenseSequenceVMatrix: cannot use permutation of same-sense words with unsupervised data"); 00472 00473 if(dvm.isNull()) 00474 PLERROR("In TextSenseSequenceVMatrix: dvm (data of the matrix) is not defined"); 00475 if(dvm->width() == 3 && rand_syn) 00476 { 00477 if(wno == NULL) 00478 PLERROR("In TextSenseSequence: there is no WordNetOntology defined"); 00479 word_given_sense_priors.resize(wno->getSenseSize()); 00480 00481 for(int i=0; i<word_given_sense_priors.size(); i++) 00482 { 00483 Set words_for_sense = wno->getWordsForSense(i); 00484 int n_words_for_sense = words_for_sense.size(); 00485 word_given_sense_priors[i]->resize(n_words_for_sense, 1); 00486 int j=0; 00487 for(SetIterator sit = words_for_sense.begin(); sit != words_for_sense.end(); sit++,j++) 00488 { 00489 word_given_sense_priors[i][j].first = *sit; 00490 word_given_sense_priors[i][j].second = 1; 00491 } 00492 } 00493 00494 if(dvm.isNull()) 00495 PLERROR("In TextSenseSequenceVMatrix: dvm (data of the matrix) is not defined"); 00496 Vec triplet(3); 00497 for(int i=0; i<dvm.length(); i++) 00498 { 00499 dvm->getRow(i, triplet); 00500 int sense = (int)triplet[1]; 00501 int word = (int )triplet[0]; 00502 if(sense >= 0 && word >= 0) 00503 { 00504 int size = word_given_sense_priors[sense].size(); 00505 for(int j=0; j<size;j++) 00506 if(word == word_given_sense_priors[sense][j].first) 00507 { 00508 word_given_sense_priors[sense][j].second += word_given_sense_priors[sense][j].second == 1 ? 1 : 2; 00509 break; 00510 } 00511 } 00512 } 00513 00514 //Normalization 00515 for(int i=0; i<word_given_sense_priors.size(); i++) 00516 { 00517 real sum = 0; 00518 for(int j=0; j<word_given_sense_priors[i].size(); j++) 00519 sum += word_given_sense_priors[i][j].second; 00520 if(sum != 0) 00521 for(int j=0; j<word_given_sense_priors[i].size(); j++) 00522 word_given_sense_priors[i][j].second /= sum; 00523 } 00524 } 00525 } 00526 00527 void TextSenseSequenceVMatrix::build() 00528 { 00529 inherited::build(); 00530 build_(); 00531 } 00532 00533 void TextSenseSequenceVMatrix::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies) 00534 { 00535 inherited::makeDeepCopyFromShallowCopy(copies); 00536 deepCopyField(dvm, copies); 00537 deepCopyField(res_pos,copies); 00538 // ### Remove this line when you have fully implemented this method. 00539 //PLERROR("TextSenseSequenceVMatrix::makeDeepCopyFromShallowCopy not fully (correctly) implemented yet!"); 00540 } 00541 00542 } // end of namespace PLearn 00543