
WordNetOntology.cc

// -*- C++ -*-

// PLearn (A C++ Machine Learning Library)
// Copyright (C) 1998 Pascal Vincent
// Copyright (C) 1999,2000 Pascal Vincent, Yoshua Bengio and University of Montreal
//
// This file is part of the PLearn Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation, version 2.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this library; see the file GPL.txt If not, write to the Free
// Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// As a special exception, you may compile and link this library with files
// not covered by the GNU General Public License, and distribute the resulting
// executable file under the terms of your choice, without the requirement to
// distribute the complete corresponding source code, provided you have
// obtained explicit written permission to do so from Pascal Vincent (primary
// author of the library) or Yoshua Bengio or the University of Montreal.
// This exception does not however invalidate any other reasons why the
// executable file might be covered by the GNU General Public License.
//
// See the following URL for more information on PLearn:
// http://plearn.sourceforge.net


/* *******************************************************
 * $Id: WordNetOntology.cc,v 1.31 2004/07/21 16:30:57 chrish42 Exp $
 * AUTHORS: Christian Jauvin
 * This file is part of the PLearn library.
 ******************************************************* */

#include "WordNetOntology.h"
#include <algo.h>
#include <plearn/base/stringutils.h>

namespace PLearn {

using namespace std;

#define NOWARNING

WordNetOntology::WordNetOntology()
{
    init();
    createBaseSynsets();
}

WordNetOntology::WordNetOntology(string voc_file,
                                 bool differentiate_unknown_words,
                                 bool pre_compute_ancestors,
                                 bool pre_compute_descendants,
                                 int wn_pos_type,
                                 int word_coverage_threshold)
{
    init(differentiate_unknown_words);
    createBaseSynsets();
    extract(voc_file, wn_pos_type);
    if (pre_compute_descendants)
        extractDescendants();
    if (pre_compute_ancestors)
        extractAncestors(word_coverage_threshold, true, true);
}

WordNetOntology::WordNetOntology(string voc_file,
                                 string synset_file,
                                 string ontology_file,
                                 bool pre_compute_ancestors,
                                 bool pre_compute_descendants,
                                 int word_coverage_threshold)
{
    init();
    //createBaseSynsets();
    load(voc_file, synset_file, ontology_file);
    if (pre_compute_descendants)
        extractDescendants();
    if (pre_compute_ancestors)
        extractAncestors(word_coverage_threshold, true, true);
}

WordNetOntology::WordNetOntology(string voc_file,
                                 string synset_file,
                                 string ontology_file,
                                 string sense_key_file,
                                 bool pre_compute_ancestors,
                                 bool pre_compute_descendants,
                                 int word_coverage_threshold)
{
    init();
    //createBaseSynsets();
    load(voc_file, synset_file, ontology_file, sense_key_file);
    if (pre_compute_descendants)
        extractDescendants();
    if (pre_compute_ancestors)
        extractAncestors(word_coverage_threshold, true, true);
}

void WordNetOntology::init(bool the_differentiate_unknown_words)
{
    if (wninit() != 0) {
        // PLERROR("WordNet init error");
    }

    noun_count = 0;
    verb_count = 0;
    adj_count = 0;
    adv_count = 0;

    synset_index = EOS_SS_ID + 1; // first synset id
    word_index = 0;
    unknown_sense_index = 0;

    noun_sense_count = 0;
    verb_sense_count = 0;
    adj_sense_count = 0;
    adv_sense_count = 0;

    in_wn_word_count = 0;
    out_of_wn_word_count = 0;

    are_ancestors_extracted = false;
    are_descendants_extracted = false;
    are_predominent_pos_extracted = false;
    are_word_high_level_senses_extracted = false;
    are_word_sense_unique_ids_computed = false;

    n_word_high_level_senses = 0;

    differentiate_unknown_words = the_differentiate_unknown_words;
}

void WordNetOntology::createBaseSynsets()
{
    // create ROOT synset
    Node* root_node = new Node(ROOT_SS_ID);
    root_node->syns.push_back("ROOT");
    root_node->types.insert(UNDEFINED_TYPE);
    root_node->gloss = "(root concept)";
    root_node->hereiam = ROOT_OFFSET;
    synsets[ROOT_SS_ID] = root_node;
    //root_node->visited = true;

    // create SUPER-UNKNOWN synset
    Node* unk_node = new Node(SUPER_UNKNOWN_SS_ID);
    unk_node->syns.push_back("SUPER_UNKNOWN");
    unk_node->types.insert(UNDEFINED_TYPE);
    unk_node->gloss = "(super-unknown concept)";
    unk_node->hereiam = SUPER_UNKNOWN_OFFSET;
    synsets[SUPER_UNKNOWN_SS_ID] = unk_node;
    //unk_node->visited = true;

    // link it <-> ROOT
    unk_node->parents.insert(ROOT_SS_ID);
    root_node->children.insert(SUPER_UNKNOWN_SS_ID);

    // create OOV (out-of-vocabulary) synset
    Node* oov_node = new Node(OOV_SS_ID);
    oov_node->syns.push_back("OOV");
    oov_node->types.insert(UNDEFINED_TYPE);
    oov_node->gloss = "(out-of-vocabulary)";
    oov_node->hereiam = OOV_OFFSET;
    synsets[OOV_SS_ID] = oov_node;
    //oov_node->visited = true;

    // link it <-> SUPER-UNKNOWN
    oov_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(OOV_SS_ID);

    // create PROPER_NOUN, NUMERIC, PUNCTUATION, BOS, EOS and STOP synsets
    Node* proper_node = new Node(PROPER_NOUN_SS_ID);
    proper_node->syns.push_back("PROPER NOUN");
    proper_node->types.insert(UNDEFINED_TYPE);
    proper_node->gloss = "(proper noun)";
    proper_node->hereiam = PROPER_NOUN_OFFSET;
    synsets[PROPER_NOUN_SS_ID] = proper_node;
    //proper_node->visited = true;

    Node* num_node = new Node(NUMERIC_SS_ID);
    num_node->syns.push_back("NUMERIC");
    num_node->types.insert(UNDEFINED_TYPE);
    num_node->gloss = "(numeric)";
    num_node->hereiam = NUMERIC_OFFSET;
    synsets[NUMERIC_SS_ID] = num_node;
    //num_node->visited = true;

    Node* punct_node = new Node(PUNCTUATION_SS_ID);
    punct_node->syns.push_back("PUNCTUATION");
    punct_node->types.insert(UNDEFINED_TYPE);
    punct_node->gloss = "(punctuation)";
    punct_node->hereiam = PUNCTUATION_OFFSET;
    synsets[PUNCTUATION_SS_ID] = punct_node;
    //punct_node->visited = true;

    Node* stop_node = new Node(STOP_SS_ID);
    stop_node->syns.push_back("STOP");
    stop_node->types.insert(UNDEFINED_TYPE);
    stop_node->gloss = "(stop)";
    stop_node->hereiam = STOP_OFFSET;
    synsets[STOP_SS_ID] = stop_node;

    Node* bos_node = new Node(BOS_SS_ID);
    bos_node->syns.push_back("BOS");
    bos_node->types.insert(UNDEFINED_TYPE);
    bos_node->gloss = "(BOS)";
    bos_node->hereiam = BOS_OFFSET;
    synsets[BOS_SS_ID] = bos_node;

    Node* eos_node = new Node(EOS_SS_ID);
    eos_node->syns.push_back("EOS");
    eos_node->types.insert(UNDEFINED_TYPE);
    eos_node->gloss = "(EOS)";
    eos_node->hereiam = EOS_OFFSET;
    synsets[EOS_SS_ID] = eos_node;

    // link them <-> SUPER-UNKNOWN
    proper_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(PROPER_NOUN_SS_ID);
    num_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(NUMERIC_SS_ID);
    punct_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(PUNCTUATION_SS_ID);
    stop_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(STOP_SS_ID);
    bos_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(BOS_SS_ID);
    eos_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(EOS_SS_ID);

    // create NOUN, VERB, ADJECTIVE and ADVERB synsets
    Node* noun_node = new Node(NOUN_SS_ID);
    noun_node->syns.push_back("NOUN");
    noun_node->types.insert(UNDEFINED_TYPE);
    noun_node->gloss = "(noun concept)";
    noun_node->hereiam = NOUN_OFFSET;
    synsets[NOUN_SS_ID] = noun_node;
    //noun_node->visited = true;

    Node* verb_node = new Node(VERB_SS_ID);
    verb_node->syns.push_back("VERB");
    verb_node->types.insert(UNDEFINED_TYPE);
    verb_node->gloss = "(verb concept)";
    verb_node->hereiam = VERB_OFFSET;
    synsets[VERB_SS_ID] = verb_node;
    //verb_node->visited = true;

    Node* adj_node = new Node(ADJ_SS_ID);
    adj_node->syns.push_back("ADJECTIVE");
    adj_node->types.insert(UNDEFINED_TYPE);
    adj_node->gloss = "(adjective concept)";
    adj_node->hereiam = ADJ_OFFSET;
    synsets[ADJ_SS_ID] = adj_node;
    //adj_node->visited = true;

    Node* adv_node = new Node(ADV_SS_ID);
    adv_node->syns.push_back("ADVERB");
    adv_node->types.insert(UNDEFINED_TYPE);
    adv_node->gloss = "(adverb concept)";
    adv_node->hereiam = ADV_OFFSET;
    synsets[ADV_SS_ID] = adv_node;
    //adv_node->visited = true;

    // link them <-> ROOT
    noun_node->parents.insert(ROOT_SS_ID);
    root_node->children.insert(NOUN_SS_ID);
    verb_node->parents.insert(ROOT_SS_ID);
    root_node->children.insert(VERB_SS_ID);
    adj_node->parents.insert(ROOT_SS_ID);
    root_node->children.insert(ADJ_SS_ID);
    adv_node->parents.insert(ROOT_SS_ID);
    root_node->children.insert(ADV_SS_ID);

}

// Build the ontology from a vocabulary file (one word per line; lines beginning with "##" are skipped)
void WordNetOntology::extract(string voc_file, int wn_pos_type)
{
    int n_lines = ShellProgressBar::getAsciiFileLineCount(voc_file);
    ShellProgressBar progress(0, n_lines - 1, "extracting ontology", 50);
    progress.draw();
    ifstream input_if(voc_file.c_str());
    string word;
    while (!input_if.eof())
    {
        getline(input_if, word, '\n');
        if (word == "") continue;
        if (word[0] == '#' && word[1] == '#') continue;
        extractWord(word, wn_pos_type, true, true, false);
        progress.update(word_index);
    }
    input_if.close();
    progress.done();
    finalize();
}

bool WordNetOntology::isInWordNet(string word, bool trim_word, bool stem_word, bool remove_undescores)
{
    if (trim_word)
        word = trimWord(word);

    if (remove_undescores)
        word = underscore_to_space(word);

    if (word == NULL_TAG)
    {
        return false;
    } else
    {
        bool found_noun = hasSenseInWordNet(word, NOUN_TYPE);
        bool found_verb = hasSenseInWordNet(word, VERB_TYPE);
        bool found_adj = hasSenseInWordNet(word, ADJ_TYPE);
        bool found_adv = hasSenseInWordNet(word, ADV_TYPE);
        bool found_stemmed_noun = false;
        bool found_stemmed_verb = false;
        bool found_stemmed_adj = false;
        bool found_stemmed_adv = false;

        if (stem_word)
        {
            string stemmed_word = stemWord(word, NOUN);
            if (stemmed_word != word)
                found_stemmed_noun = hasSenseInWordNet(stemmed_word, NOUN_TYPE);
            stemmed_word = stemWord(word, VERB);
            if (stemmed_word != word)
                found_stemmed_verb = hasSenseInWordNet(stemmed_word, VERB_TYPE);
            stemmed_word = stemWord(word, ADJ);
            if (stemmed_word != word)
                found_stemmed_adj = hasSenseInWordNet(stemmed_word, ADJ_TYPE);
            stemmed_word = stemWord(word, ADV);
            if (stemmed_word != word)
                found_stemmed_adv = hasSenseInWordNet(stemmed_word, ADV_TYPE);
        }

        if (found_noun || found_verb || found_adj || found_adv ||
            found_stemmed_noun || found_stemmed_verb || found_stemmed_adj || found_stemmed_adv)
        {
            return true;
        } else
        {
            return false;
        }
    }
}

bool WordNetOntology::hasSenseInWordNet(string word, int wn_pos_type)
{
    //char* cword = const_cast<char*>(word.c_str());
    char* cword = cstr(word);
    SynsetPtr ssp = NULL;

    switch (wn_pos_type)
    {
    case NOUN_TYPE:
        ssp = findtheinfo_ds(cword, NOUN, -HYPERPTR, ALLSENSES);
        break;
    case VERB_TYPE:
        ssp = findtheinfo_ds(cword, VERB, -HYPERPTR, ALLSENSES);
        break;
    case ADJ_TYPE:
        ssp = findtheinfo_ds(cword, ADJ, -HYPERPTR, ALLSENSES);
        break;
    case ADV_TYPE:
        ssp = findtheinfo_ds(cword, ADV, -HYPERPTR, ALLSENSES);
        break;
    }

    bool ssp_is_null = (ssp == NULL);

    delete(cword);
    free_syns(ssp);

    return !ssp_is_null;

}

void WordNetOntology::extractWord(string original_word, int wn_pos_type, bool trim_word, bool stem_word, bool remove_undescores)
{
    bool found_noun = false;
    bool found_verb = false;
    bool found_adj = false;
    bool found_adv = false;
    bool found_stemmed_noun = false;
    bool found_stemmed_verb = false;
    bool found_stemmed_adj = false;
    bool found_stemmed_adv = false;
    bool found = false;
    string processed_word = original_word;
    string stemmed_word;

    words[word_index] = original_word;
    words_id[original_word] = word_index;

    if (!catchSpecialTags(original_word))
    {
        if (trim_word)
            processed_word = trimWord(original_word);

        if (remove_undescores)
            processed_word = underscore_to_space(processed_word);

        if (processed_word == NULL_TAG)
        {
            out_of_wn_word_count++;
            processUnknownWord(word_index);
            word_is_in_wn[word_index] = false;
        } else
        {
            if (wn_pos_type == NOUN_TYPE || wn_pos_type == ALL_WN_TYPE)
                found_noun = extractSenses(original_word, processed_word, NOUN_TYPE);
            if (wn_pos_type == VERB_TYPE || wn_pos_type == ALL_WN_TYPE)
                found_verb = extractSenses(original_word, processed_word, VERB_TYPE);
            if (wn_pos_type == ADJ_TYPE || wn_pos_type == ALL_WN_TYPE)
                found_adj = extractSenses(original_word, processed_word, ADJ_TYPE);
            if (wn_pos_type == ADV_TYPE || wn_pos_type == ALL_WN_TYPE)
                found_adv = extractSenses(original_word, processed_word, ADV_TYPE);

            if (stem_word)
            {
                if (wn_pos_type == NOUN_TYPE || wn_pos_type == ALL_WN_TYPE)
                {
                    stemmed_word = stemWord(processed_word, NOUN);
                    if (stemmed_word != processed_word)
                        found_stemmed_noun = extractSenses(original_word, stemmed_word, NOUN_TYPE);
                }
                if (wn_pos_type == VERB_TYPE || wn_pos_type == ALL_WN_TYPE)
                {
                    stemmed_word = stemWord(processed_word, VERB);
                    if (stemmed_word != processed_word)
                        found_stemmed_verb = extractSenses(original_word, stemmed_word, VERB_TYPE);
                }
                if (wn_pos_type == ADJ_TYPE || wn_pos_type == ALL_WN_TYPE)
                {
                    stemmed_word = stemWord(processed_word, ADJ);
                    if (stemmed_word != processed_word)
                        found_stemmed_adj = extractSenses(original_word, stemmed_word, ADJ_TYPE);
                }
                if (wn_pos_type == ADV_TYPE || wn_pos_type == ALL_WN_TYPE)
                {
                    stemmed_word = stemWord(processed_word, ADV);
                    if (stemmed_word != processed_word)
                        found_stemmed_adv = extractSenses(original_word, stemmed_word, ADV_TYPE);
                }
            }

            found = (found_noun || found_verb || found_adj || found_adv ||
                     found_stemmed_noun || found_stemmed_verb || found_stemmed_adj || found_stemmed_adv);
            if (found)
            {
                in_wn_word_count++;
                word_is_in_wn[word_index] = true;
            } else
            {
                out_of_wn_word_count++;
                processUnknownWord(word_index);
                word_is_in_wn[word_index] = false;
            }
        }
    } else // word is a "special tag" (<OOV>, etc...)
    {
        out_of_wn_word_count++;
        word_is_in_wn[word_index] = false;
    }
    if (word_to_senses[word_index].isEmpty())
        PLWARNING("word %d (%s) was not processed correctly (found = %d)", word_index, words[word_index].c_str(), found);
    word_index++;

}


Node *
WordNetOntology::findSynsetFromSynsAndGloss(const vector<string> &syns, const string &gloss, const long offset, const int fnum)
{
    for (map<int, Node *>::iterator it = synsets.begin(); it != synsets.end(); ++it) {
        Node *node = it->second;
        if ((node->gloss == gloss) && (node->syns == syns) && (node->hereiam == offset) && (node->fnum == fnum))
            return node;
    }
    return NULL;
}

void
WordNetOntology::extractTaggedWordFrequencies(map<int, map<int, int> > &word_senses_to_tagged_frequencies)
{
    // NOTE: The 'word_senses_to_tagged_frequencies' is a map where the key to the
    //       map is a 'word_id' and the value associated with the key is another
    //       map. This other map takes a 'synset_id' as its key and associates
    //       a frequency value. Thus the data structure associates a frequency
    //       to a (word_id, synset_id) couple.

    cout << "in WordNetOntology::extractTaggedWordFrequencies()" << endl;
    vector<int> dbases;
    dbases.reserve(4);
    dbases.push_back(NOUN);
    dbases.push_back(VERB);
    dbases.push_back(ADJ);
    dbases.push_back(ADV);
    int dbases_size = dbases.size();

    word_senses_to_tagged_frequencies.clear();
    vector<string> syns;
    string gloss;
    long offset;
    int fnum;

    int total_senses_found = 0;
    ShellProgressBar progress(0, words.size() * dbases_size, "[Extracting word-sense tagged frequencies]", 50);
    progress.draw();
    int ws2tf_i = 0;

    // Go through all databases
    for (int i = 0; i < dbases_size; ++i) {
        // Go through all words in the ontology
        for (map<int, string>::iterator w_it = words.begin(); w_it != words.end(); ++w_it) {
            progress.update(++ws2tf_i);
            char *cword = cstr(w_it->second);
            wnresults.numforms = wnresults.printcnt = 0; // Useful??
            SynsetPtr ssp = findtheinfo_ds(cword, dbases[i], -HYPERPTR, ALLSENSES);
            if (ssp != NULL) {
                IndexPtr idx;
                SynsetPtr cursyn;
                while ((idx = getindex(cword, dbases[i])) != NULL) {
                    cword = NULL;
                    if (idx->tagged_cnt) {
                        map<int, map<int, int> >::iterator ws2tf_it = word_senses_to_tagged_frequencies.find(w_it->first);
                        if (ws2tf_it == word_senses_to_tagged_frequencies.end()) {
                            word_senses_to_tagged_frequencies[w_it->first] = map<int, int>();
                            ws2tf_it = word_senses_to_tagged_frequencies.find(w_it->first);
                        }
                        //for (int l = 0; l < idx->tagged_cnt; ++l) {
                        for (int l = 0; l < idx->sense_cnt; ++l) {
                            if ((cursyn = read_synset(dbases[i], idx->offset[l], idx->wd)) != NULL) {
                                //int freq = GetTagcnt(idx, l + 1);
                                int freq = -1;
                                wnresults.OutSenseCount[wnresults.numforms]++;
                                // Find if synset is in ontology
                                //if (freq) {
                                // NOTE: We extract zero frequencies even though
                                //       this is not useful...
                                syns = getSynsetWords(cursyn);
                                gloss = string(cursyn->defn);
                                offset = cursyn->hereiam;
                                fnum = cursyn->fnum;

                                Node *node = findSynsetFromSynsAndGloss(syns, gloss, offset, fnum);
                                if (node != NULL) {
                                    (ws2tf_it->second)[node->ss_id] = freq;
                                    ++total_senses_found;
                                }
                                //}
                                free_synset(cursyn);
                            }
                        }
                    }
                    wnresults.numforms++;
                    free_index(idx);
                } // while()
                free_syns(ssp);
            } // ssp != NULL
        }
    }
    progress.done();
    cout << "FOUND A GRAND TOTAL OF " << total_senses_found << " senses" << endl;
}

/*
int WordNetOntology::extractFrequencies(string word, int whichsense, int dbase)
{
    IndexPtr idx;
    SynsetPtr cursyn;
    int freq = 0;
    char *searchstr = cstr(word);
    char *cpstring = searchstr;
    wnresults.numforms = wnresults.printcnt = 0;

    // TODO: I don't know why but sometimes I get a segmentation fault in
    //       getindex(). So to avoid the problem I first check with
    //       findtheinfo_ds() to see if there's something to be queried.
    if (findtheinfo_ds(cpstring, dbase, -HYPERPTR, whichsense) != NULL) {
        while ((idx = getindex(cpstring, dbase)) != NULL) {
            cpstring = NULL;
            if ((whichsense + 1) <= idx->tagged_cnt) {
                if ((cursyn = read_synset(dbase, idx->offset[whichsense], idx->wd)) != NULL) {
                    if ((whichsense < idx->off_cnt) && (idx->tagged_cnt != -1))
                        freq = GetTagcnt(idx, whichsense + 1);
                    wnresults.OutSenseCount[wnresults.numforms]++;
                    free_synset(cursyn);
                }
            }
            wnresults.numforms++;
            free_index(idx);
        }
    }
    return freq;
}
*/

bool WordNetOntology::extractSenses(string original_word, string processed_word, int wn_pos_type)
{

    //char* cword = const_cast<char*>(processed_word.c_str());
    char* cword = cstr(processed_word);
    SynsetPtr ssp = NULL;
    IndexPtr idx = getindex(cword, wn_pos_type);

    switch (wn_pos_type)
    {
    case NOUN_TYPE:
        ssp = findtheinfo_ds(cword, NOUN, -HYPERPTR, ALLSENSES);
        break;
    case VERB_TYPE:
        ssp = findtheinfo_ds(cword, VERB, -HYPERPTR, ALLSENSES);
        break;
    case ADJ_TYPE:
        ssp = findtheinfo_ds(cword, ADJ, -HYPERPTR, ALLSENSES);
        break;
    case ADV_TYPE:
        ssp = findtheinfo_ds(cword, ADV, -HYPERPTR, ALLSENSES);
        break;
    }

    if (ssp == NULL)
    {
        return false;
    } else
    {
        switch (wn_pos_type)
        {
        case NOUN_TYPE:
            noun_count++;
            break;
        case VERB_TYPE:
            verb_count++;
            break;
        case ADJ_TYPE:
            adj_count++;
            break;
        case ADV_TYPE:
            adv_count++;
            break;
        }

        int wnsn = 0;
        SynsetPtr head_ssp = ssp; // keep the head of the sense list so it can be freed after the loop
        // extract all senses for a given word
        while (ssp != NULL)
        {
            wnsn++;
            Node* node = checkForAlreadyExtractedSynset(ssp);
            if (node == NULL) // not found
            {

                switch (wn_pos_type)
                {
                case NOUN_TYPE:
                    noun_sense_count++;
                    break;
                case VERB_TYPE:
                    verb_sense_count++;
                    break;
                case ADJ_TYPE:
                    adj_sense_count++;
                    break;
                case ADV_TYPE:
                    adv_sense_count++;
                    break;
                }

                // create a new sense (first-level synset Node)
                node = extractOntology(ssp);
            }

            int word_id = words_id[original_word];
            node->types.insert(wn_pos_type);
            word_to_senses[word_id].insert(node->ss_id);
            sense_to_words[node->ss_id].insert(word_id);

            char *charsk = WNSnsToStr(idx, wnsn);
            string sense_key(charsk);

            pair<int, string> ss(word_id, sense_key);
            if (sense_key_to_ss_id.find(ss) == sense_key_to_ss_id.end())
                sense_key_to_ss_id[ss] = node->ss_id;
            pair<int, int> ws(word_id, node->ss_id);

            //cout << sense_key << "word_id: " << word_id << "synset " << node->ss_id << endl;

            // e.g. green%1:13:00:: and greens%1:13:00::
            // correspond to the same synset
            if (ws_id_to_sense_key.find(ws) == ws_id_to_sense_key.end())
                ws_id_to_sense_key[ws] = sense_key;

            // warning : should check if inserting a given sense twice (vector)
            // (should not happen if vocabulary contains only unique values)
            switch(wn_pos_type)
            {
            case NOUN_TYPE:
                word_to_noun_wnsn[word_id].push_back(node->ss_id);
                word_to_noun_senses[word_id].insert(node->ss_id);
                break;
            case VERB_TYPE:
                word_to_verb_wnsn[word_id].push_back(node->ss_id);
                word_to_verb_senses[word_id].insert(node->ss_id);
                break;
            case ADJ_TYPE:
                word_to_adj_wnsn[word_id].push_back(node->ss_id);
                word_to_adj_senses[word_id].insert(node->ss_id);
                break;
            case ADV_TYPE:
                word_to_adv_wnsn[word_id].push_back(node->ss_id);
                word_to_adv_senses[word_id].insert(node->ss_id);
                break;
            }

            ssp = ssp->nextss;
        }
        free_syns(head_ssp); // free the sense list returned by findtheinfo_ds (ssp itself is NULL here)
        return true;
    }
}

Node* WordNetOntology::extractOntology(SynsetPtr ssp)
{
    Node* node = new Node(synset_index++); // increment synset counter
    node->syns = getSynsetWords(ssp);
    string defn = ssp->defn;
    removeDelimiters(defn, "*", "%");
    removeDelimiters(defn, "|", "/");
    node->gloss = defn;
    node->hereiam = ssp->hereiam;
    node->fnum = ssp->fnum;
    node->is_unknown = false;
    synsets[node->ss_id] = node;

    ssp = ssp->ptrlist;

    while (ssp != NULL)
    {
        Node* parent_node = checkForAlreadyExtractedSynset(ssp);
        if (parent_node == NULL) // create new synset Node
        {
            parent_node = extractOntology(ssp);
        }

        if (parent_node->ss_id != node->ss_id && !(node->children.contains(parent_node->ss_id))) // avoid cycles (that are in fact due to errors in the WordNet database)
        {
            node->parents.insert(parent_node->ss_id);
            parent_node->children.insert(node->ss_id);
        }

        ssp = ssp->nextss;
    }
    return node;
}

bool WordNetOntology::catchSpecialTags(string word)
{
    int word_id = words_id[word];
    if (word == OOV_TAG)
    {
        word_to_senses[word_id].insert(OOV_SS_ID);
        sense_to_words[OOV_SS_ID].insert(word_id);
        return true;
    } else if (word == PROPER_NOUN_TAG)
    {
        word_to_senses[word_id].insert(PROPER_NOUN_SS_ID);
        sense_to_words[PROPER_NOUN_SS_ID].insert(word_id);
        return true;
    } else if (word == NUMERIC_TAG)
    {
        word_to_senses[word_id].insert(NUMERIC_SS_ID);
        sense_to_words[NUMERIC_SS_ID].insert(word_id);
        return true;
    } else if (word == PUNCTUATION_TAG)
    {
        word_to_senses[word_id].insert(PUNCTUATION_SS_ID);
        sense_to_words[PUNCTUATION_SS_ID].insert(word_id);
        return true;
    } else if (word == STOP_TAG)
    {
        word_to_senses[word_id].insert(STOP_SS_ID);
        sense_to_words[STOP_SS_ID].insert(word_id);
        return true;
    } else if (word == BOS_TAG)
    {
        word_to_senses[word_id].insert(BOS_SS_ID);
        sense_to_words[BOS_SS_ID].insert(word_id);
        return true;
    } else if (word == EOS_TAG)
    {
        word_to_senses[word_id].insert(EOS_SS_ID);
        sense_to_words[EOS_SS_ID].insert(word_id);
        return true;
    }
    return false;
}

void WordNetOntology::lookForSpecialTags()
{
    if (!isSense(OOV_SS_ID))
        PLWARNING("no <oov> tag found");
    if (!isSense(PROPER_NOUN_SS_ID))
        PLWARNING("no <proper_noun> tag found");
    if (!isSense(NUMERIC_SS_ID))
        PLWARNING("no <numeric> tag found");
    if (!isSense(PUNCTUATION_SS_ID))
        PLWARNING("no <punctuation> tag found");
    if (!isSense(STOP_SS_ID))
        PLWARNING("no <stop> tag found");
}

void WordNetOntology::finalize()
{
    propagatePOSTypes();
    linkUpperCategories();
    removeNonReachableSynsets();
}

int WordNetOntology::getWordSenseIdForWnsn(string word, int wn_pos_type, int wnsn)
{
    if (!isWord(word))
    {
#ifndef NOWARNING
        PLWARNING("asking for a non-word (%s)", word.c_str());
#endif
        return WNO_ERROR;
    }

    int word_id = words_id[word];
    switch (wn_pos_type)
    {
    case NOUN_TYPE:
        if (wnsn > (int)word_to_noun_wnsn[word_id].size())
        {
#ifndef NOWARNING
            PLWARNING("invalid noun wnsn (%d)", wnsn);
#endif
            return WNO_ERROR;
        } else
            return word_to_noun_wnsn[word_id][wnsn - 1];
        break;
    case VERB_TYPE:
        if (wnsn > (int)word_to_verb_wnsn[word_id].size())
        {
#ifndef NOWARNING
            PLWARNING("invalid verb wnsn (%d)", wnsn);
#endif
            return WNO_ERROR;
        } else
            return word_to_verb_wnsn[word_id][wnsn - 1];
        break;
    case ADJ_TYPE:
        if (wnsn > (int)word_to_adj_wnsn[word_id].size())
        {
#ifndef NOWARNING
            PLWARNING("invalid adj wnsn (%d)", wnsn);
#endif
            return WNO_ERROR;
        } else
            return word_to_adj_wnsn[word_id][wnsn - 1];
        break;
    case ADV_TYPE:
        if (wnsn > (int)word_to_adv_wnsn[word_id].size())
        {
#ifndef NOWARNING
            PLWARNING("invalid adv wnsn (%d)", wnsn);
#endif
            return WNO_ERROR;
        } else
            return word_to_adv_wnsn[word_id][wnsn - 1];
        break;
    default:
#ifndef NOWARNING
        PLWARNING("undefined type");
#endif
        return WNO_ERROR;
    }
}

int WordNetOntology::getWordSenseIdForSenseKey(string lemma, string lexsn, string word)
{
    string sense_key = lemma + "%" + lexsn;
    char* csense_key = cstr(sense_key);
    SynsetPtr ssp = GetSynsetForSense(csense_key);
    if (ssp != NULL)
    {
        vector<string> synset_words = getSynsetWords(ssp);
        string gloss = ssp->defn;
        int word_id = words_id[word];
        long offset = ssp->hereiam;
        int fnum = ssp->fnum;
        for (SetIterator it = word_to_senses[word_id].begin(); it != word_to_senses[word_id].end(); ++it)
        {
            Node* node = synsets[*it];
            if (node->syns == synset_words && node->gloss == gloss && node->hereiam == offset && node->fnum == fnum)
                return node->ss_id;
        }
    }
    return WNO_ERROR;
}

void WordNetOntology::processUnknownWord(int word_id)
{
    if (differentiate_unknown_words)
    {
        // create an UNKNOWN synset for a particular word
        Node* unk_node = new Node(synset_index++);
        int unknown_sense_id = unknown_sense_index++;
        unk_node->syns.push_back("UNKNOWN_SENSE_" + tostring(unknown_sense_id));
        unk_node->gloss = "(unknown sense " + tostring(unknown_sense_id) + ")";
        unk_node->types.insert(UNDEFINED_TYPE);
        unk_node->hereiam = EOS_OFFSET - unknown_sense_id - 1;
        synsets[unk_node->ss_id] = unk_node;

        // link UNKNOWN <-> SUPER-UNKNOWN
        unk_node->parents.insert(SUPER_UNKNOWN_SS_ID);
        synsets[SUPER_UNKNOWN_SS_ID]->children.insert(unk_node->ss_id);

        word_to_senses[word_id].insert(unk_node->ss_id);
        sense_to_words[unk_node->ss_id].insert(word_id);
    } else // all the unknown words are linked to SUPER-UNKNOWN
    {      // (acting in this context as a sense)
        word_to_senses[word_id].insert(SUPER_UNKNOWN_SS_ID);
        sense_to_words[SUPER_UNKNOWN_SS_ID].insert(word_id);
    }

}

void WordNetOntology::propagatePOSTypes()
{
    for (map<int, Set>::iterator it = sense_to_words.begin(); it != sense_to_words.end(); ++it)
    {
        Node* node = synsets[it->first];
        propagatePOSTypes(node);
    }
    unvisitAll();
}

void WordNetOntology::propagatePOSTypes(Node* node)
{
    node->visited = true;
    for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it)
    {
        Node* parent_node = synsets[*it];
        for (SetIterator iit = node->types.begin(); iit != node->types.end(); ++iit)
        {
            parent_node->types.insert(*iit);
        }
        if (parent_node->types.size() > 1)
        {
#ifndef NOWARNING
            PLWARNING("a synset has more than 1 type");
#endif
        }
        if (!parent_node->visited)
            propagatePOSTypes(parent_node);
    }
}

void WordNetOntology::unvisitAll()
{
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
        it->second->visited = false;
}

// link last-level nodes with the corresponding prior-to-ROOT POS super-category
void WordNetOntology::linkUpperCategories()
{
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        int ss_id = it->first;
        Node* node = it->second;
        if (node->parents.size() == 0 && ss_id != ROOT_SS_ID)
        {
            bool link_directly_to_root = true;
            if (node->types.contains(NOUN_TYPE))
            {
                node->parents.insert(NOUN_SS_ID);
                synsets[NOUN_SS_ID]->children.insert(ss_id);
                link_directly_to_root = false;
            }
            if (node->types.contains(VERB_TYPE))
            {
                node->parents.insert(VERB_SS_ID);
                synsets[VERB_SS_ID]->children.insert(ss_id);
                link_directly_to_root = false;
            }
            if (node->types.contains(ADJ_TYPE))
            {
                node->parents.insert(ADJ_SS_ID);
                synsets[ADJ_SS_ID]->children.insert(ss_id);
                link_directly_to_root = false;
            }
            if (node->types.contains(ADV_TYPE))
            {
                node->parents.insert(ADV_SS_ID);
                synsets[ADV_SS_ID]->children.insert(ss_id);
                link_directly_to_root = false;
            }
            if (link_directly_to_root)
            {
                node->parents.insert(ROOT_SS_ID);
                synsets[ROOT_SS_ID]->children.insert(ss_id);
            }
        }
    }
}

// void WordNetOntology::setLevels()
// {
//     Node* root_node = synsets[ROOT_SS_ID];
//     root_node->level = 0;
//     for (SetIterator it = root_node->children.begin(); it != root_node->children.end(); ++it)
//         setLevels(*it, 1);
// }

// void WordNetOntology::setLevels(int ss_id, int level)
// {
//     Node* node = synsets[ss_id];
//     if (node->level != -1 && node->level != level)
//     {
//         PLWARNING("a synset is at 2 different levels (old level = %d, new level = %d)", node->level, level);
//         printSynset(ss_id, 1);
//     }
//     node->level = level;
//     for (SetIterator it = node->children.begin(); it != node->children.end(); ++it)
//         setLevels(*it, level + 1);
// }

// look for an identical, already extracted synset
Node* WordNetOntology::checkForAlreadyExtractedSynset(SynsetPtr ssp)
{
    vector<string> syns = getSynsetWords(ssp);
    string gloss = ssp->defn;
    long offset = ssp->hereiam;
    int fnum = ssp->fnum;
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        Node* node = it->second;
        if (node->syns == syns && node->gloss == gloss && node->hereiam == offset && node->fnum == fnum)
        {
            return node;
        }
    }
    return NULL;

}

vector<string> WordNetOntology::getSynsetWords(SynsetPtr ssp)
{
    vector<string> syns;
    for (int i = 0; i < ssp->wcount; i++)
    {
        strsubst(ssp->words[i], '_', ' ');
        string word_i = ssp->words[i];
        removeDelimiters(word_i, "*", "%");
        removeDelimiters(word_i, "|", "/");
        syns.push_back(word_i);
    }
    return syns;
}

void WordNetOntology::print(bool print_ontology)
{
    for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it)
    {
        cout << words[it->first] << endl;
        for (SetIterator iit = it->second.begin(); iit != it->second.end(); ++iit)
        {
            printSynset(*iit, 1);
            if (print_ontology)
            {
                printOntology(synsets[*iit], 2);
            }
        }
    }
}

void WordNetOntology::printOntology(Node* node, int level)
{
    for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it)
    {
        printSynset(*it, level);
        printOntology(synsets[*it], level + 1);
    }
}

void WordNetOntology::printSynset(int ss_id, int indent_level)
{
    for (int i = 0; i < indent_level; i++) cout << " "; // indent
    cout << "=> ";

    for (vector<string>::iterator it = synsets[ss_id]->syns.begin(); it != synsets[ss_id]->syns.end(); ++it)
    {
        cout << *it << ", ";
    }
    cout << " (" << ss_id << ")" << endl;

    for (int i = 0; i < indent_level; i++) cout << " "; // indent
    cout << "fnum: " << synsets[ss_id]->fnum << " synset offset: " << synsets[ss_id]->hereiam << " gloss = " << synsets[ss_id]->gloss << endl;
    //cout << "syns = " << synsets[ss_id]->syns << endl;
    // cout << " {";
    // for (SetIterator it = synsets[ss_id]->types.begin(); it != synsets[ss_id]->types.end(); ++it)
    // {
    //     int type = *it;
    //     switch (type)
    //     {
    //     case NOUN_TYPE:
    //         cout << "noun ";
    //         break;
    //     case VERB_TYPE:
    //         cout << "verb ";
    //         break;
    //     case ADJ_TYPE:
    //         cout << "adjective ";
    //         break;
    //     case ADV_TYPE:
    //         cout << "adverb ";
    //         break;
    //     case UNDEFINED_TYPE:
    //         cout << "undefined ";
    //         break;
    //     }
    // }
    // cout << "}" << endl;
}

void WordNetOntology::printSynset(int ss_id, ostream& sout, int indent_level)
{
    for (int i = 0; i < indent_level; i++) sout << " "; // indent
    sout << "=> ";

    for (vector<string>::iterator it = synsets[ss_id]->syns.begin(); it != synsets[ss_id]->syns.end(); ++it)
    {
        sout << *it << ", ";
    }
    sout << " (" << ss_id << ")" << endl;

    for (int i = 0; i < indent_level; i++) sout << " "; // indent (was mistakenly written to cout)
    sout << "gloss = " << synsets[ss_id]->gloss << endl;

}


void WordNetOntology::printStats()
{
    /*
    cout << getSenseSize() << " senses (" << noun_sense_count << " nouns, " << verb_sense_count << " verbs, "
         << adj_sense_count << " adjectives, " << adv_sense_count << " adverbs) for " << getVocSize() << " words" << endl;
    cout << out_of_wn_word_count << " out-of-wordnet words" << endl;
    cout << in_wn_word_count << " in-wordnet words" << endl;
    cout << noun_count << " nouns" << endl;
    cout << verb_count << " verbs" << endl;
    cout << adj_count << " adjectives" << endl;
    cout << adv_count << " adverbs" << endl;
    cout << (double)getSenseSize() / (double)getVocSize() << " senses per word on average" << endl;
    int all_classes = noun_count + verb_count + adj_count + adv_count;
    cout << (double)all_classes / (double)in_wn_count << " classes per word on average" << endl;
    */
    cout << getVocSize() << " words in vocabulary" << endl;
    cout << in_wn_word_count << " in WN words" << endl;
    cout << out_of_wn_word_count << " out of WN words" << endl;
    cout << getSenseSize() << " senses (" << (real)getSenseSize() / (real)getVocSize() << " senses per word on average)" << endl;
    cout << getSynsetSize() << " categories (ontology : sense + category, possible overlap)" << endl;
    if (are_word_high_level_senses_extracted)
    {
        cout << n_word_high_level_senses << " high-level senses (" << (real)n_word_high_level_senses / (real)getVocSize() << " high-level senses per word on average)" << endl;
    }
}

void WordNetOntology::save(string synset_file, string ontology_file)
{
    // synset
    ofstream of_synsets(synset_file.c_str());
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        int ss_id = it->first;
        Node* node = it->second;
        of_synsets << ss_id << "*|";
        for (SetIterator it = node->types.begin(); it != node->types.end(); ++it)
        {
            of_synsets << *it << "|";
        }
        of_synsets << "*|";
        of_synsets << node->gloss << "|";
        for (vector<string>::iterator iit = node->syns.begin(); iit != node->syns.end(); ++iit)
        {
            of_synsets << *iit << "|";
        }
        of_synsets << "*|";
        of_synsets << node->fnum << "|";
        of_synsets << node->hereiam << "|";
        of_synsets << endl;
    }
    of_synsets.close();

    // ontology
    ofstream of_ontology(ontology_file.c_str());
    for (map<int, Set>::iterator wit = word_to_senses.begin(); wit != word_to_senses.end(); ++wit)
    {
        int word_id = wit->first;
        of_ontology << "w " << word_id << " " << word_is_in_wn[word_id] << endl;
    }
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        int id = it->first;
        Node* node = it->second;
        for (SetIterator iit = node->children.begin(); iit != node->children.end(); ++iit)
        {
            int child_id = *iit;
            of_ontology << "c " << id << " " << child_id << endl;
        }
        if (sense_to_words.find(id) != sense_to_words.end())
        {
            for (SetIterator iit = sense_to_words[id].begin(); iit != sense_to_words[id].end(); ++iit)
                of_ontology << "s " << id << " " << (*iit) << endl;
        }
    }

    of_ontology.close();
}

void WordNetOntology::save(string voc_file)
{
    ofstream of_voc(voc_file.c_str());
    for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it)
    {
        of_voc << it->second << endl;
    }
    of_voc.close();
}


void WordNetOntology::saveVocInWordnet(string voc_file)
{
    ofstream of_voc(voc_file.c_str());
    for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it)
    {
        if (word_is_in_wn[it->first] == false) continue;
        of_voc << it->second << endl;
    }
    of_voc.close();
}


void WordNetOntology::save(string synset_file, string ontology_file, string sense_key_file)
{
    save(synset_file, ontology_file);

    ofstream of_voc(sense_key_file.c_str());
    for (map<pair<int, int>, string>::iterator it = ws_id_to_sense_key.begin(); it != ws_id_to_sense_key.end(); ++it)
    {
        of_voc << it->second << " " << (it->first).first << " " << (it->first).second << endl;
    }
    of_voc.close();
}


void WordNetOntology::load(string voc_file, string synset_file, string ontology_file)
{
    ifstream if_voc(voc_file.c_str());
    if (!if_voc) PLERROR("can't open %s", voc_file.c_str());
    ifstream if_synsets(synset_file.c_str());
    if (!if_synsets) PLERROR("can't open %s", synset_file.c_str());
    ifstream if_ontology(ontology_file.c_str());
    if (!if_ontology) PLERROR("can't open %s", ontology_file.c_str());

    string line;
    int word_count = 0;
    while (!if_voc.eof()) // voc
    {
        getline(if_voc, line, '\n');
        if (line == "") continue;
        if (line[0] == '#' && line[1] == '#') continue;
        words_id[line] = word_count;
        word_to_senses[word_count] = Set();
        words[word_count++] = line;
    }
    if_voc.close();
    word_index = word_count;
    int line_no = 0;
    int ss_id = -1;
    while (!if_synsets.eof()) // synsets
    {
        ++line_no;
        getline(if_synsets, line, '\n');
        if (line == "") continue;
        if (line[0] == '#') continue;
        vector<string> tokens = split(line, "*");
        if (tokens.size() != 3 && tokens.size() != 4)
            PLERROR("the synset file does not have the expected format, line %d = '%s'", line_no, line.c_str());
        if (tokens.size() == 3 && line_no == 1)
            PLWARNING("The synset file doesn't contain enough information for correct representation of the synsets!");
        ss_id = toint(tokens[0]);
        vector<string> type_tokens = split(tokens[1], "|");
        vector<string> ss_tokens = split(tokens[2], "|");
        vector<string> offset_tokens;
        if (tokens.size() == 4) offset_tokens = split(tokens[3], "|");
        Node* node = new Node(ss_id);
        for (unsigned int i = 0; i < type_tokens.size(); i++)
            node->types.insert(toint(type_tokens[i]));
        node->gloss = ss_tokens[0];
        //node->syns.reserve(ss_tokens.size() - 1);
        for (unsigned int i = 1; i < ss_tokens.size(); i++)
        {
            if (i == 1) // extract unknown_sense_index
                if (startsWith(ss_tokens[i], "UNKNOWN_SENSE_"))
                    unknown_sense_index = toint(ss_tokens[i].substr(14, ss_tokens[i].size())) + 1;
            node->syns.push_back(ss_tokens[i]);
        }
        if (tokens.size() == 4)
        {
            node->fnum = toint(offset_tokens[0]);
            node->hereiam = tolong(offset_tokens[1]);
        }
        synsets[node->ss_id] = node;
    }
    synset_index = ss_id + 1;
    if_synsets.close();
    int n_lines = ShellProgressBar::getAsciiFileLineCount(ontology_file);
    ShellProgressBar progress(0, n_lines - 1, "loading ontology", 50);
    progress.draw();
    int counter = 0;
    while (!if_ontology.eof()) // ontology
    {
        getline(if_ontology, line, '\n');
        progress.update(counter++);
        if (line == "") continue;
        if (line[0] == '#') continue;
        vector<string> tokens = split(line);
        if (tokens.size() != 3)
        {
            PLERROR("the ontology file does not have the expected format");
        }
        int id = toint(tokens[1]);
        int child_id;

        if (tokens[0] == "w")
        {
            bool is_in_wn = tobool(tokens[2]);
            word_is_in_wn[id] = is_in_wn;
            if (is_in_wn)
                in_wn_word_count++;
            else
                out_of_wn_word_count++;
        } else if (tokens[0] == "s")
        {
            child_id = toint(tokens[2]);
            word_to_senses[child_id].insert(id);
            sense_to_words[id].insert(child_id);
            for (SetIterator tit = synsets[id]->types.begin(); tit != synsets[id]->types.end(); ++tit)
            {
                int type = *tit;
                switch (type)
                {
                case NOUN_TYPE:
                    word_to_noun_senses[child_id].insert(id);
                    break;
                case VERB_TYPE:
                    word_to_verb_senses[child_id].insert(id);
                    break;
                case ADJ_TYPE:
                    word_to_adj_senses[child_id].insert(id);
                    break;
                case ADV_TYPE:
                    word_to_adv_senses[child_id].insert(id);
                    break;
                }
            }
        } else if (tokens[0] == "c")
        {
            child_id = toint(tokens[2]);
            synsets[child_id]->parents.insert(id);
            synsets[id]->children.insert(child_id);
        }
    }
    if_ontology.close();
    progress.done();
    // (all three streams have already been closed above)
}

void WordNetOntology::load(string voc_file, string synset_file, string ontology_file, string sense_key_file)
{
    load(voc_file, synset_file, ontology_file);

    ifstream if_sense_key(sense_key_file.c_str());
    if (!if_sense_key) PLERROR("can't open %s", sense_key_file.c_str());

    string line;
    while (!if_sense_key.eof()) // voc
    {
        getline(if_sense_key, line, '\n');
        if (line == "") continue;
        if (line[0] == '#' && line[1] == '#') continue;
        vector<string> tokens = split(line, " ");
        if (tokens.size() != 3)
            PLERROR("sense_key_file %s not compatible", sense_key_file.c_str());
        pair<int, string> ss(toint(tokens[1]), tokens[0]);
        sense_key_to_ss_id[ss] = toint(tokens[2]);
        pair<int, int> ws(toint(tokens[1]), toint(tokens[2]));
        ws_id_to_sense_key[ws] = tokens[0];
    }
    if_sense_key.close();
}

void WordNetOntology::printNodes()
{
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        Node* node = it->second;
        cout << "Node id = " << node->ss_id << " | parents = ";
        for (SetIterator pit = node->parents.begin(); pit != node->parents.end(); ++pit)
        {
            cout << *pit << " ";
        }
        cout << " | children = ";
        for (SetIterator cit = node->children.begin(); cit != node->children.end(); ++cit)
        {
            cout << *cit << " ";
        }
        cout << endl;
    }
}

void WordNetOntology::extractAncestors(int threshold, bool cut_with_word_coverage, bool exclude_itself)
{
#ifdef VERBOSE
    cout << "extracting ancestors... ";
#endif

    if (cut_with_word_coverage && !are_descendants_extracted)
    {
        cout << "*** I need to extract descendants before I can extract ancestors with a word coverage threshold ***" << endl;
        extractDescendants();
    }

    // synsets -> ancestors
    int n_sense_ancestors = 0;
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        int ss = it->first;
        Node* node = it->second;
        Set ancestors;
        if (cut_with_word_coverage)
            extractAncestors(node, ancestors, threshold);
        else
            extractAncestors(node, ancestors, 1, threshold);
        if (!exclude_itself)
            ancestors.insert(ss);
        synset_to_ancestors[ss] = ancestors;
        n_sense_ancestors += ancestors.size();
    }

    are_ancestors_extracted = true;

    // words -> ancestors
    int n_word_ancestors = 0;
    for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it)
    {
        int word_id = it->first;
        Set senses = it->second;
        Set word_ancestors;
        for (SetIterator it = senses.begin(); it != senses.end(); it++)
        {
            int sense_id = *it;
            Set ancestors = getSynsetAncestors(sense_id);
            word_ancestors.merge(ancestors);
            word_ancestors.insert(sense_id);
        }
        word_to_ancestors[word_id] = word_ancestors;
        n_word_ancestors += word_ancestors.size();
    }

#ifdef VERBOSE
    cout << "(" << n_sense_ancestors << " sense ancestors, " << n_word_ancestors << " word ancestors)" << endl;
#endif

}

// "word coverage threshold" version
void WordNetOntology::extractAncestors(Node* node, Set ancestors, int word_coverage_threshold)
{
    /*
    int ss_id = node->ss_id;
    if (word_coverage_threshold == -1 || synset_to_word_descendants[ss_id].size() < word_coverage_threshold)
    {
        ancestors.insert(ss_id);
        for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it)
        {
            extractAncestors(synsets[*it], ancestors, word_coverage_threshold);
        }
    }
    */

    for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it)
    {
        int ss_id = *it;
        if (word_coverage_threshold == -1 || synset_to_word_descendants[ss_id].size() < word_coverage_threshold)
        {
            ancestors.insert(ss_id);
            extractAncestors(synsets[ss_id], ancestors, word_coverage_threshold);
        }
    }
}

// "level threshold" version
void WordNetOntology::extractAncestors(Node* node, Set ancestors, int level, int level_threshold)
{
    for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it)
    {
        ancestors.insert(*it);
        if (level_threshold == -1 || level < level_threshold)
            extractAncestors(synsets[*it], ancestors, level + 1, level_threshold);
    }
}

Set WordNetOntology::getSynsetAncestors(int id, int max_level)
{
    if (are_ancestors_extracted)
    {
        if (!isSynset(id))
        {
#ifndef NOWARNING
            PLWARNING("asking for a non-synset id (%d)", id);
#endif
        }
        return synset_to_ancestors[id];
    } else
    {
        Set ancestors;
        if (isSynset(id))
        {
#ifndef NOWARNING
            PLWARNING("using non-pre-computed version");
#endif
            extractAncestors(synsets[id], ancestors, 1, max_level);
        } else
        {
#ifndef NOWARNING
            PLWARNING("asking for a non-synset id (%d)", id);
#endif
        }
        return ancestors;
    }
}

Set WordNetOntology::getSynsetParents(int id)
{
    return synsets[id]->parents;
}

Set WordNetOntology::getWordAncestors(int id, int max_level)
{
    if (are_ancestors_extracted)
    {
        if (!isWord(id))
        {
#ifndef NOWARNING
            PLWARNING("asking for a non-word id (%d)", id);
#endif
        }
        return word_to_ancestors[id];
    } else
    {
        Set word_ancestors;
        if (isWord(id))
        {
#ifndef NOWARNING
            PLWARNING("using non-pre-computed version");
#endif
            for (SetIterator it = word_to_senses[id].begin(); it != word_to_senses[id].end(); ++it)
            {
                int sense_id = *it;
                word_ancestors.insert(sense_id);
                Set synset_ancestors = getSynsetAncestors(sense_id, max_level);
                word_ancestors.merge(synset_ancestors);
            }
        } else
        {
#ifndef NOWARNING
            PLWARNING("asking for a non-word id");
#endif
        }

        return word_ancestors;
    }
}

bool WordNetOntology::isInWordNet(int word_id)
{
#ifndef NOWARNING
    if (!isWord(word_id))
    {
        PLWARNING("asking for a non-word id (%d)", word_id);
        return false;
    }
#endif
    return word_is_in_wn[word_id];
}

string WordNetOntology::getSenseKey(int word_id, int ss_id)
{
    pair<int, int> ws(word_id, ss_id);
    if (ws_id_to_sense_key.find(ws) == ws_id_to_sense_key.end())
        return "";
    return ws_id_to_sense_key[ws];

}

int WordNetOntology::getSynsetIDForSenseKey(int word_id, string sense_key)
{
    pair<int, string> ss(word_id, sense_key);
    map< pair<int, string>, int>::iterator it = sense_key_to_ss_id.find(ss);
    if (it == sense_key_to_ss_id.end())
        return -1;
    else
        return it->second;
}

int WordNetOntology::getWordId(string word)
{
    map<string, int>::iterator it = words_id.find(word);
    if (it == words_id.end())
    {
        map<string, int>::iterator iit = words_id.find(OOV_TAG);
        if (iit == words_id.end())
            return -1;
        else
            return iit->second;
    } else
    {
        return it->second;
    }

// #ifndef NOWARNING
//     if (words_id.find(word) == words_id.end())
//     {
//         PLWARNING("asking for a non-word (%s)", word.c_str());
//         return -1;
//     }
// #endif
//     return words_id[word];
}

string WordNetOntology::getWord(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return NULL_TAG;
    }
#endif
    return words[id];
}

Set WordNetOntology::getWordSenses(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return Set();
    }
#endif
    return word_to_senses[id];
}

Set WordNetOntology::getWordHighLevelSenses(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return Set();
    }
#endif

    if (!are_word_high_level_senses_extracted)
        PLERROR("word high-level senses have not been extracted");

    return word_to_high_level_senses[id];
}

Set WordNetOntology::getWordNounSenses(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return Set();
    }
#endif
    return word_to_noun_senses[id];
}

Set WordNetOntology::getWordVerbSenses(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return Set();
    }
#endif
    return word_to_verb_senses[id];
}

Set WordNetOntology::getWordAdjSenses(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return Set();
    }
#endif
    return word_to_adj_senses[id];
}

Set WordNetOntology::getWordAdvSenses(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return Set();
    }
#endif
    return word_to_adv_senses[id];
}

Set WordNetOntology::getWordsForSense(int id)
{
#ifndef NOWARNING
    if (!isSense(id))
    {
        PLWARNING("asking for a non-sense id (%d)", id);
        return Set();
    }
#endif
    return sense_to_words[id];
}

Node* WordNetOntology::getSynset(int id)
{
#ifndef NOWARNING
    if (!isSynset(id))
    {
        PLWARNING("asking for a non-synset id (%d)", id);
        return NULL;
    }
#endif
#ifndef NOWARNING
    if (synsets.find(id) == synsets.end()) {
        PLWARNING("Asking for a non-existent synset id (%d)", id);
        return NULL;
    }
#endif
    return synsets[id];
}

void WordNetOntology::printSynsetAncestors()
{
    if (!are_ancestors_extracted)
    {
        extractAncestors(WORD_COVERAGE_THRESHOLD, true, true);
    }
    for (map<int, Set>::iterator it = synset_to_ancestors.begin(); it != synset_to_ancestors.end(); ++it)
    {
        cout << it->first << " -> ";
        for (SetIterator iit = it->second.begin(); iit != it->second.end(); ++iit)
            cout << *iit << " ";
        cout << endl;
    }
}

void WordNetOntology::printWordAncestors()
{
    if (!are_ancestors_extracted)
    {
        extractAncestors(WORD_COVERAGE_THRESHOLD, true, true);
    }
    for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it)
    {
        int id = it->first;
        cout << id << " -> ";
        Set ancestors = getWordAncestors(id);
        for (SetIterator iit = ancestors.begin(); iit != ancestors.end(); ++iit)
        {
            cout << *iit << " ";
        }
        cout << endl;
    }
}

void WordNetOntology::extractDescendants()
{
#ifdef VERBOSE
    cout << "extracting descendants... ";
#endif

    int n_sense_descendants = 0;
    int n_word_descendants = 0;
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        Set sense_descendants;
        Set word_descendants;
        extractDescendants(it->second, sense_descendants, word_descendants);
        synset_to_sense_descendants[it->first] = sense_descendants;
        synset_to_word_descendants[it->first] = word_descendants;
        n_sense_descendants += sense_descendants.size();
        n_word_descendants += word_descendants.size();
    }
    are_descendants_extracted = true;

#ifdef VERBOSE
    cout << "(" << n_sense_descendants << " senses, " << n_word_descendants << " words)" << endl;
#endif

}

void WordNetOntology::extractDescendants(Node* node, Set sense_descendants, Set word_descendants)
{
    int ss_id = node->ss_id;
    if (isSense(ss_id)) // is a sense
    {
        sense_descendants.insert(ss_id);
        for (SetIterator it = sense_to_words[ss_id].begin(); it != sense_to_words[ss_id].end(); ++it)
        {
            int word_id = *it;
            word_descendants.insert(word_id);
        }
    }
    for (SetIterator it = node->children.begin(); it != node->children.end(); ++it)
    {
        extractDescendants(synsets[*it], sense_descendants, word_descendants);
    }
}

// Extract descendants but does not include the node itself in the sense descendants
void WordNetOntology::extractStrictDescendants(Node* node, Set sense_descendants, Set word_descendants)
{
    int ss_id = node->ss_id;
    if (isSense(ss_id)){ // is a sense
        for (SetIterator it = sense_to_words[ss_id].begin(); it != sense_to_words[ss_id].end(); ++it){
            int word_id = *it;
            word_descendants.insert(word_id);
        }
    }
    for (SetIterator it = node->children.begin(); it != node->children.end(); ++it){
        extractDescendants(synsets[*it], sense_descendants, word_descendants);
    }
}


Set WordNetOntology::getSynsetSenseDescendants(int id)
{
    if (are_descendants_extracted)
    {
        if (!isSynset(id))
        {
#ifndef NOWARNING
            PLWARNING("asking for a non-synset id (%d)", id);
#endif
        }
        return synset_to_sense_descendants[id];
    }

    Set sense_descendants;
    if (isSynset(id))
    {
#ifndef NOWARNING
        PLWARNING("using non-pre-computed version");
#endif
        extractDescendants(synsets[id], sense_descendants, Set());
    } else
    {
#ifndef NOWARNING
        PLWARNING("asking for non-synset id (%d)", id);
#endif
    }
    return sense_descendants;
}

Set WordNetOntology::getSynsetWordDescendants(int id)
{
    if (are_descendants_extracted)
    {
        if (!isSynset(id))
        {
#ifndef NOWARNING
            PLWARNING("asking for a non-synset id (%d)", id);
#endif
        }
        return synset_to_word_descendants[id];
    }

    Set word_descendants;
    if (isSynset(id))
    {
#ifndef NOWARNING
        PLWARNING("using non-pre-computed version");
#endif
        extractDescendants(synsets[id], Set(), word_descendants);
    } else
    {
#ifndef NOWARNING
        PLWARNING("asking for non-synset id (%d)", id);
#endif
    }
    return word_descendants;
}

void WordNetOntology::printDescendants()
{
    /*
    if (!are_descendants_extracted)
    {
        extractDescendants();
    }
(map<int, Set>::iterator it = synset_to_descendants.begin(); it != synset_to_descendants.end(); ++it) 01933 { 01934 cout << it->first << " -> "; 01935 for (SetIterator iit = it->second.begin(); iit != it->second.end(); ++iit) 01936 cout << *iit << " "; 01937 cout << endl; 01938 } 01939 */ 01940 } 01941 01942 bool WordNetOntology::isWord(int id) 01943 { 01944 return (words.find(id) != words.end()); 01945 } 01946 01947 bool WordNetOntology::isWord(string word) 01948 { 01949 return (words_id.find(word) != words_id.end()); 01950 } 01951 01952 bool WordNetOntology::isSense(int id) 01953 { 01954 return (sense_to_words.find(id) != sense_to_words.end()); 01955 } 01956 01957 bool WordNetOntology::isPureSense(int id) 01958 { 01959 return (isSense(id) && synsets[id]->children.size() == 0); 01960 } 01961 01962 bool WordNetOntology::isCategory(int id) 01963 { 01964 return isSynset(id); 01965 } 01966 01967 bool WordNetOntology::isPureCategory(int id) 01968 { 01969 return (isCategory(id) && !isSense(id)); 01970 } 01971 01972 bool WordNetOntology::isSynset(int id) 01973 { 01974 return (synsets.find(id) != synsets.end()); 01975 } 01976 01977 int WordNetOntology::overlappingSynsets(int ss_id1, int ss_id2) 01978 { 01979 Set words1 = sense_to_words[ss_id1]; 01980 Set words2 = sense_to_words[ss_id2]; 01981 Set overlap; 01982 for (SetIterator it1=words1.begin();it1!=words1.end();++it1) 01983 if (words2.contains(*it1)) 01984 overlap.insert(*it1); 01985 //for (set<int>::iterator it=overlap.begin();it!=overlap.end();++it) 01986 // cout << words[*it] << endl; 01987 return overlap.size(); 01988 } 01989 01990 Set WordNetOntology::getAllWords() 01991 { 01992 Set all_words; 01993 for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it) 01994 { 01995 all_words.insert(it->first); 01996 } 01997 return all_words; 01998 } 01999 02000 Set WordNetOntology::getAllSenses() 02001 { 02002 Set senses; 02003 for (map<int, Set>::iterator it = sense_to_words.begin(); it != sense_to_words.end(); ++it) 02004 { 02005 senses.insert(it->first); 02006 } 02007 return senses; 02008 } 02009 02010 Set WordNetOntology::getAllCategories() 02011 { 02012 Set categories; 02013 for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it) 02014 { 02015 categories.insert(it->first); 02016 } 02017 return categories; 02018 } 02019 02020 void WordNetOntology::printWordOntology(int id) 02021 { 02022 cout << words[id] << endl; 02023 for (SetIterator sit = word_to_senses[id].begin(); sit != word_to_senses[id].end(); ++sit) 02024 { 02025 int sense_id = *sit; 02026 printSynset(sense_id, 1); 02027 printOntology(synsets[sense_id], 2); 02028 } 02029 } 02030 02031 void WordNetOntology::printWordOntology(string word) 02032 { 02033 printWordOntology(words_id[word]); 02034 } 02035 02036 void WordNetOntology::printInvertedSynsetOntology(int id, int level) 02037 { 02038 if (isSynset(id)) 02039 { 02040 printSynset(id, level); 02041 for (SetIterator it = synsets[id]->children.begin(); it != synsets[id]->children.end(); ++it) 02042 { 02043 printInvertedSynsetOntology(*it, level + 1); 02044 } 02045 } else 02046 { 02047 #ifndef NOWARNING 02048 PLWARNING("asking for a non-synset id (%d)", id); 02049 #endif 02050 } 02051 } 02052 02053 void WordNetOntology::intersectAncestorsAndSenses(Set categories, Set senses) 02054 { 02055 // for every "word -> ancestors" mapping, intersect "ancestors" 02056 // with "categories" 02057 for (map<int, Set>::iterator it = word_to_ancestors.begin(); it != word_to_ancestors.end(); ++it) 
02058 { 02059 it->second.intersection(categories); 02060 } 02061 02062 // for every "synset -> ancestors" mapping (where "synset" = "sense" U "category"), 02063 // remove the whole mapping if "synset" (the key) does not intersect with "categories" 02064 Set keys_to_be_removed; 02065 for (map<int, Set>::iterator it = synset_to_ancestors.begin(); it != synset_to_ancestors.end(); ++it) 02066 { 02067 if (!categories.contains(it->first) && !senses.contains(it->first)) 02068 keys_to_be_removed.insert(it->first); 02069 } 02070 // purge synset_to_ancestors 02071 for (SetIterator it = keys_to_be_removed.begin(); it != keys_to_be_removed.end(); ++it) 02072 { 02073 synset_to_ancestors.erase(*it); 02074 synsets.erase(*it); 02075 } 02076 02077 // for every remaining "synset -> ancestors" mapping (where "synset" = "sense" U "category"), 02078 // intersect "ancestors" with "categories" 02079 for (map<int, Set>::iterator it = synset_to_ancestors.begin(); it != synset_to_ancestors.end(); ++it) 02080 { 02081 it->second.intersection(categories); 02082 } 02083 02084 // for every "word -> senses" mapping, intersect its senses 02085 // with "senses" 02086 for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it) 02087 { 02088 it->second.intersection(senses); 02089 } 02090 02091 keys_to_be_removed->clear(); 02092 for (map<int, Set>::iterator it = sense_to_words.begin(); it != sense_to_words.end(); ++it) 02093 { 02094 if (!senses.contains(it->first)) 02095 keys_to_be_removed.insert(it->first); 02096 } 02097 02098 for (SetIterator it = keys_to_be_removed.begin(); it != keys_to_be_removed.end(); ++it) 02099 { 02100 sense_to_words.erase(*it); 02101 } 02102 } 02103 02104 bool WordNetOntology::isWordUnknown(string word) 02105 { 02106 return isWordUnknown(words_id[word]); 02107 } 02108 02109 bool WordNetOntology::isWordUnknown(int id) 02110 { 02111 bool is_unknown = true; 02112 for (SetIterator it = word_to_senses[id].begin(); it != word_to_senses[id].end(); ++it) 02113 { 02114 if (!synsets[*it]->is_unknown) 02115 is_unknown = false; 02116 } 02117 return is_unknown; 02118 } 02119 02120 bool WordNetOntology::isSynsetUnknown(int id) 02121 { 02122 return synsets[id]->is_unknown; 02123 } 02124 02125 // set<int> getWordCategoriesAtLevel(int id, int level) 02126 // { 02127 // set<int> categories; 02128 // for (set<int>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it) 02129 // { 02130 // int sense_id = *it; 02131 // Node* node = sense_to_ontology[sense_id]; 02132 // int cur_level = 0; 02133 // while (cur_level < level) 02134 // { 02135 02136 // } 02137 // } 02138 // } 02139 02140 // void WordNetOntology::getCategoriesAtLevel(int ss_id, int level, set<int>& categories) 02141 // { 02142 // Node* node = synsets[ss_id]; 02143 // if (node->level == level) 02144 // { 02145 // categories.insert(ss_id); 02146 // } else 02147 // { 02148 // for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it) 02149 // { 02150 // getCategoriesAtLevel(*it, level, categories); 02151 // } 02152 // } 02153 // } 02154 02155 void WordNetOntology::getDownToUpParentCategoriesAtLevel(int ss_id, int target_level, Set categories, int cur_level) 02156 { 02157 Node* node = synsets[ss_id]; 02158 if (cur_level == target_level && !isTopLevelCategory(ss_id)) 02159 { 02160 categories.insert(ss_id); 02161 } else 02162 { 02163 for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it) 02164 getDownToUpParentCategoriesAtLevel(*it, target_level, 
categories, cur_level + 1); 02165 } 02166 } 02167 02168 void WordNetOntology::getCategoriesAtLevel(int ss_id, int cur_level, int target_level, set<int>& categories) 02169 { 02170 Node* node = synsets[ss_id]; 02171 if (cur_level == target_level && !isTopLevelCategory(ss_id)) 02172 { 02173 categories.insert(ss_id); 02174 } else 02175 { 02176 for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it) 02177 { 02178 getCategoriesAtLevel(*it, cur_level + 1, target_level, categories); 02179 } 02180 } 02181 } 02182 02183 void WordNetOntology::getCategoriesUnderLevel(int ss_id, int cur_level, int target_level, Set categories) 02184 { 02185 Node* node = synsets[ss_id]; 02186 if (!isTopLevelCategory(ss_id)) 02187 categories.insert(ss_id); 02188 if (cur_level != target_level) 02189 { 02190 for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it) 02191 getCategoriesUnderLevel(*it, cur_level + 1, target_level, categories); 02192 } 02193 } 02194 02195 void WordNetOntology::reducePolysemy(int level) 02196 { 02197 ShellProgressBar progress(0, words.size() - 1, "reducing polysemy", 50); 02198 progress.init(); 02199 progress.draw(); 02200 int count = 0; 02201 for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it) 02202 { 02203 int word_id = it->first; 02204 //reduceWordPolysemy(word_id, level); 02205 reduceWordPolysemy_preserveSenseOverlapping(word_id, level); 02206 progress.update(count++); 02207 } 02208 progress.done(); 02209 removeNonReachableSynsets(); 02210 } 02211 02212 // for every sense of a word, build a list of associated categories at the 02213 // nth ontological level, and use these to cluster the senses; we can keep only 02214 // a single sense in every clusters, thus performing a sense reduction 02215 void WordNetOntology::reduceWordPolysemy(int word_id, int level) 02216 { 02217 Set senses = word_to_senses[word_id]; 02218 Set senses_to_be_removed; 02219 if (senses.size() > 1) 02220 { 02221 //SetsValuesSet svs; 02222 set<set<int> > ss; 02223 for (SetIterator it = senses.begin(); it != senses.end(); ++it) 02224 { 02225 int sense_id = *it; 02226 set<int> categories_at_level; 02227 getCategoriesAtLevel(sense_id, 0, level, categories_at_level); 02228 02229 // cout << "sense_id = " << sense_id << ", categories_at_level[" << level << "] for word '" << words[word_id] << "' : "; 02230 // for (set<int>::iterator sit = categories_at_level.begin(); sit != categories_at_level.end(); ++sit) 02231 // cout << *sit << " "; 02232 02233 if (categories_at_level.size() != 0) 02234 { 02235 // if a list of categories, for a given sense, is already extracted 02236 // (through a different sense) mark the sense for deletion 02237 //bool already_there = !svs.insert(categories_at_level, sense_id); 02238 bool already_there = (ss.find(categories_at_level) != ss.end()); 02239 if (already_there) 02240 { 02241 //cout << "*" << endl; 02242 senses_to_be_removed.insert(sense_id); 02243 sense_to_words[sense_id].remove(word_id); 02244 // if a sense doesn't point to any word anymore, erase it from the sense table 02245 if (sense_to_words[sense_id].isEmpty()) 02246 sense_to_words.erase(sense_id); 02247 } else 02248 { 02249 ss.insert(categories_at_level); 02250 //cout << endl; 02251 } 02252 } else 02253 { 02254 //cout << endl; 02255 } 02256 } 02257 // erase the marked senses 02258 for (SetIterator it = senses_to_be_removed.begin(); it != senses_to_be_removed.end(); ++it) 02259 { 02260 int sense_id = *it; 02261 word_to_senses[word_id].remove(sense_id); 02262 
word_to_noun_senses[word_id].remove(sense_id); 02263 word_to_verb_senses[word_id].remove(sense_id); 02264 word_to_adj_senses[word_id].remove(sense_id); 02265 word_to_adv_senses[word_id].remove(sense_id); 02266 } 02267 } 02268 } 02269 02270 void WordNetOntology::reduceWordPolysemy_preserveSenseOverlapping(int word_id, int level) 02271 { 02272 Set senses = word_to_senses[word_id]; 02273 Set senses_to_be_removed; 02274 map<set<int>, Set> categories_to_senses; 02275 if (senses.size() > 1) 02276 { 02277 for (SetIterator it = senses.begin(); it != senses.end(); ++it) 02278 { 02279 int sense_id = *it; 02280 set<int> categories_at_level; 02281 getCategoriesAtLevel(sense_id, 0, level, categories_at_level); 02282 if (categories_at_level.size() != 0) 02283 categories_to_senses[categories_at_level].insert(sense_id); 02284 } 02285 02286 for (map<set<int>, Set>::iterator it = categories_to_senses.begin(); it != categories_to_senses.end(); ++it) 02287 { 02288 Set sense_cluster = it->second; 02289 if (sense_cluster.size() > 1) 02290 { 02291 int sense_cluster_size = sense_cluster.size(); 02292 int n_sense_removed = 0; 02293 for (SetIterator sit = sense_cluster.begin(); sit != sense_cluster.end(); ++sit) 02294 { 02295 int sense_id = *sit; 02296 if (sense_to_words[sense_id].size() < 2 && n_sense_removed < (sense_cluster_size - 1)) 02297 { 02298 senses_to_be_removed.insert(sense_id); 02299 sense_to_words[sense_id].remove(word_id); 02300 // if a sense doesn't point to any word anymore, erase it from the sense table 02301 if (sense_to_words[sense_id].isEmpty()) 02302 sense_to_words.erase(sense_id); 02303 n_sense_removed++; 02304 } 02305 } 02306 } 02307 } 02308 02309 if (!senses_to_be_removed.isEmpty()) 02310 { 02311 cout << words[word_id] << endl; 02312 // cout << "senses = " << senses; 02313 // cout << ", senses_to_be_removed = " << senses_to_be_removed << endl; 02314 } 02315 02316 // erase the marked senses 02317 for (SetIterator it = senses_to_be_removed.begin(); it != senses_to_be_removed.end(); ++it) 02318 { 02319 int sense_id = *it; 02320 02321 printSynset(sense_id, 1); 02322 02323 word_to_senses[word_id].remove(sense_id); 02324 word_to_noun_senses[word_id].remove(sense_id); 02325 word_to_verb_senses[word_id].remove(sense_id); 02326 word_to_adj_senses[word_id].remove(sense_id); 02327 word_to_adv_senses[word_id].remove(sense_id); 02328 } 02329 } 02330 } 02331 02332 void WordNetOntology::reduceWordPolysemy_preserveSenseOverlapping2(int word_id, int level) 02333 { 02334 /* 02335 Set senses = word_to_senses[word_id]; 02336 Set senses_to_be_removed; 02337 map<int, Set> sense_to_categories_under_level(senses.size()); 02338 if (senses.size() > 1) 02339 { 02340 for (SetIterator it = senses.begin(); it != senses.end(); ++it) 02341 { 02342 int sense_id = *it; 02343 Set categories_under_level; 02344 getCategoriesUnderLevel(sense_id, 0, level, categories_under_level); 02345 sense_to_categories_under_level[sense_id] = categories_under_level; 02346 } 02347 02348 if (!senses_to_be_removed.isEmpty()) 02349 { 02350 //cout << words[word_id] << endl; 02351 //cout << "senses = " << senses; 02352 //cout << ", senses_to_be_removed = " << senses_to_be_removed << endl; 02353 } 02354 // erase the marked senses 02355 for (SetIterator it = senses_to_be_removed.begin(); it != senses_to_be_removed.end(); ++it) 02356 { 02357 int sense_id = *it; 02358 02359 printSynset(sense_id, 1); 02360 02361 word_to_senses[word_id].remove(sense_id); 02362 word_to_noun_senses[word_id].remove(sense_id); 02363 
word_to_verb_senses[word_id].remove(sense_id); 02364 word_to_adj_senses[word_id].remove(sense_id); 02365 word_to_adv_senses[word_id].remove(sense_id); 02366 } 02367 } 02368 */ 02369 } 02370 02371 // remove all the synsets that are not accessible through the word table 02372 // direction is : words -> senses -> categories 02373 void WordNetOntology::removeNonReachableSynsets() 02374 { 02375 // visit the whole graph, beginning with words, and going upward, and marking the nodes 02376 for (map<int, Set>::iterator wit = word_to_senses.begin(); wit != word_to_senses.end(); ++wit) 02377 { 02378 Set senses = wit->second; 02379 for (SetIterator sit = senses.begin(); sit != senses.end(); ++sit) 02380 { 02381 int sense_id = *sit; 02382 visitUpward(synsets[sense_id]); 02383 } 02384 } 02385 // mark synsets that need to be removed 02386 Set synsets_to_be_removed; 02387 for (map<int, Node*>::iterator sit = synsets.begin(); sit != synsets.end(); ++sit) 02388 { 02389 int ss_id = sit->first; 02390 Node* node = sit->second; 02391 if (!node->visited) 02392 { 02393 synsets_to_be_removed.insert(ss_id); 02394 } else 02395 { 02396 // for a synset that does not need to be removed, check if there are child pointers 02397 // to a removed one (mark them for deletion if so) 02398 Set children_to_be_removed; 02399 for (SetIterator cit = node->children.begin(); cit != node->children.end(); ++cit) 02400 { 02401 int child_id = *cit; 02402 if (!synsets[child_id]->visited) 02403 children_to_be_removed.insert(child_id); 02404 } 02405 // remove the marked child pointers 02406 for (SetIterator rit = children_to_be_removed.begin(); rit != children_to_be_removed.end(); ++rit) 02407 node->children.remove(*rit); 02408 } 02409 } 02410 02411 // remove the marked synsets 02412 for (SetIterator rit = synsets_to_be_removed.begin(); rit != synsets_to_be_removed.end(); ++rit) 02413 { 02414 int ss_id = *rit; 02415 delete(synsets[ss_id]); 02416 synsets.erase(ss_id); 02417 } 02418 02419 unvisitAll(); 02420 } 02421 02422 void WordNetOntology::removeWord(int id) 02423 { 02424 string word_string = words[id]; 02425 words.erase(id); 02426 word_to_senses.erase(id); 02427 word_to_noun_senses.erase(id); 02428 word_to_verb_senses.erase(id); 02429 word_to_adj_senses.erase(id); 02430 word_to_adv_senses.erase(id); 02431 words_id.erase(word_string); 02432 word_to_noun_wnsn.erase(id); 02433 word_to_verb_wnsn.erase(id); 02434 word_to_adj_wnsn.erase(id); 02435 word_to_adv_wnsn.erase(id); 02436 word_to_predominent_pos.erase(id); 02437 // word_is_in_wn[id]=0 Should I do that ? 
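// Note: the word_is_in_wn entry is not erased here, so a later isInWordNet() call on a
// removed word id will still return the previously stored flag.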
02438 word_to_high_level_senses.erase(id); 02439 } 02440 02441 02442 void WordNetOntology::visitUpward(Node* node) 02443 { 02444 node->visited = true; 02445 for (SetIterator pit = node->parents.begin(); pit != node->parents.end(); ++pit) 02446 { 02447 int parent_id = *pit; 02448 if (!synsets[parent_id]->visited) 02449 visitUpward(synsets[parent_id]); 02450 } 02451 } 02452 02453 void 02454 WordNetOntology::unvisitDownward(Node *node) 02455 { 02456 node->visited = false; 02457 for (SetIterator s_it = node->children.begin(); s_it != node->children.end(); ++s_it) { 02458 Node *child = synsets[*s_it]; 02459 if (child->visited) 02460 unvisitDownward(child); 02461 } 02462 } 02463 02464 void WordNetOntology::detectWordsWithoutOntology() 02465 { 02466 for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it) 02467 { 02468 int word_id = it->first; 02469 Set senses = it->second; 02470 if (senses.isEmpty()) 02471 PLWARNING("word %d (%s) has no attached ontology", word_id, words[word_id].c_str()); 02472 } 02473 } 02474 02475 int WordNetOntology::getMaxSynsetId() 02476 { 02477 return synsets.rbegin()->first; 02478 } 02479 02480 Set WordNetOntology::getSyntacticClassesForWord(int word_id) 02481 { 02482 #ifndef NOWARNING 02483 if (!isWord(word_id)) 02484 PLWARNING("asking for a non-word id (%d)", word_id); 02485 #endif 02486 Set syntactic_classes; 02487 Set senses = word_to_senses[word_id]; 02488 for (SetIterator it = senses.begin(); it != senses.end(); ++it) 02489 { 02490 Node* node = synsets[*it]; 02491 for (SetIterator tit = node->types.begin(); tit != node->types.end(); ++tit) 02492 syntactic_classes.insert(*tit); 02493 } 02494 return syntactic_classes; 02495 } 02496 02497 int WordNetOntology::getSyntacticClassForSense(int sense_id) 02498 { 02499 #ifndef NOWARNING 02500 if (!isSense(sense_id)) 02501 PLWARNING("asking for a non-sense id (%d)", sense_id); 02502 #endif 02503 Node* sense = synsets[sense_id]; 02504 if (sense->types.size() > 1) 02505 PLWARNING("a sense has more than 1 POS type"); 02506 int type = *(sense->types.begin()); 02507 return type; 02508 } 02509 02510 int WordNetOntology::getPredominentSyntacticClassForWord(int word_id) 02511 { 02512 #ifndef NOWARNING 02513 if (!isWord(word_id)) 02514 PLWARNING("asking for a non-word id (%d)", word_id); 02515 #endif 02516 if (are_predominent_pos_extracted) 02517 return word_to_predominent_pos[word_id]; 02518 int n_noun = 0; 02519 int n_verb = 0; 02520 int n_adj = 0; 02521 int n_adv = 0; 02522 Set senses = word_to_senses[word_id]; 02523 for (SetIterator it = senses.begin(); it != senses.end(); ++it) 02524 { 02525 int sense_id = *it; 02526 int type = getSyntacticClassForSense(sense_id); 02527 switch (type) 02528 { 02529 case NOUN_TYPE: 02530 n_noun++; 02531 break; 02532 case VERB_TYPE: 02533 n_verb++; 02534 break; 02535 case ADJ_TYPE: 02536 n_adj++; 02537 break; 02538 case ADV_TYPE: 02539 n_adv++; 02540 } 02541 } 02542 if (n_noun == 0 && n_verb == 0 && n_adj == 0 && n_adv == 0) 02543 return UNDEFINED_TYPE; 02544 else if (n_noun >= n_verb && n_noun >= n_adj && n_noun >= n_adv) 02545 return NOUN_TYPE; 02546 else if (n_verb >= n_noun && n_verb >= n_adj && n_verb >= n_adv) 02547 return VERB_TYPE; 02548 else if (n_adj >= n_noun && n_adj >= n_verb && n_adj >= n_adv) 02549 return ADJ_TYPE; 02550 else 02551 return ADV_TYPE; 02552 } 02553 02554 void WordNetOntology::extractPredominentSyntacticClasses() 02555 { 02556 for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it) 02557 { 02558 int 
word_id = it->first; 02559 word_to_predominent_pos[word_id] = getPredominentSyntacticClassForWord(word_id); 02560 } 02561 are_predominent_pos_extracted = true; 02562 } 02563 02564 void WordNetOntology::savePredominentSyntacticClasses(string file) 02565 { 02566 ofstream out_pos(file.c_str()); 02567 for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it) 02568 { 02569 int word_id = it->first; 02570 out_pos << getPredominentSyntacticClassForWord(word_id) << endl; 02571 02572 } 02573 out_pos.close(); 02574 } 02575 02576 void WordNetOntology::loadPredominentSyntacticClasses(string file) 02577 { 02578 ifstream in_pos(file.c_str()); 02579 int line_counter = 0; 02580 while (!in_pos.eof()) 02581 { 02582 string line = pgetline(in_pos); 02583 if (line == "") continue; 02584 int pos = toint(line); 02585 word_to_predominent_pos[line_counter++] = pos; 02586 } 02587 in_pos.close(); 02588 are_predominent_pos_extracted = true; 02589 } 02590 02591 bool WordNetOntology::isTopLevelCategory(int ss_id) 02592 { 02593 return (ss_id == ROOT_SS_ID || ss_id == SUPER_UNKNOWN_SS_ID || 02594 ss_id == NOUN_SS_ID || ss_id == VERB_SS_ID || 02595 ss_id == ADJ_SS_ID || ss_id == ADV_SS_ID || 02596 ss_id == OOV_SS_ID || ss_id == PROPER_NOUN_SS_ID || 02597 ss_id == NUMERIC_SS_ID || ss_id == PUNCTUATION_SS_ID || 02598 ss_id == STOP_SS_ID || ss_id == UNDEFINED_SS_ID || 02599 ss_id == BOS_SS_ID || ss_id == EOS_SS_ID); 02600 } 02601 02602 void WordNetOntology::getDescendantCategoriesAtLevel(int ss_id, int cur_level, int target_level, Set categories) 02603 { 02604 if (isSynset(ss_id)) 02605 { 02606 Node* node = synsets[ss_id]; 02607 02608 // WARNING: HERE IS A HUGE HACK!!! 02609 if (cur_level < target_level && isSense(ss_id)) 02610 { 02611 Set words = sense_to_words[ss_id]; 02612 for (SetIterator wit = words.begin(); wit != words.end(); ++wit) 02613 word_to_under_target_level_high_level_senses[*wit].insert(ss_id); 02614 } 02616 02617 if (cur_level == target_level) 02618 categories.insert(ss_id); 02619 else 02620 { 02621 for (SetIterator it = node->children.begin(); it != node->children.end(); ++it) 02622 getDescendantCategoriesAtLevel(*it, cur_level + 1, target_level, categories); 02623 } 02624 } 02625 } 02626 02627 void WordNetOntology::extractWordHighLevelSenses(int noun_depth, int verb_depth, int adj_depth, int adv_depth, int unk_depth) 02628 { 02629 Set noun_categories; 02630 getDescendantCategoriesAtLevel(NOUN_SS_ID, 0, noun_depth, noun_categories); 02631 for (SetIterator sit = noun_categories.begin(); sit != noun_categories.end(); ++sit) 02632 { 02633 int ss_id = *sit; 02634 Set word_descendants = getSynsetWordDescendants(ss_id); 02635 for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit) 02636 { 02637 int word_id = *wit; 02638 word_to_high_level_senses[word_id].insert(ss_id); 02639 } 02640 } 02641 Set verb_categories; 02642 getDescendantCategoriesAtLevel(VERB_SS_ID, 0, verb_depth, verb_categories); 02643 for (SetIterator sit = verb_categories.begin(); sit != verb_categories.end(); ++sit) 02644 { 02645 int ss_id = *sit; 02646 Set word_descendants = getSynsetWordDescendants(ss_id); 02647 for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit) 02648 { 02649 int word_id = *wit; 02650 word_to_high_level_senses[word_id].insert(ss_id); 02651 } 02652 } 02653 Set adj_categories; 02654 getDescendantCategoriesAtLevel(ADJ_SS_ID, 0, adj_depth, adj_categories); 02655 for (SetIterator sit = adj_categories.begin(); sit != 
adj_categories.end(); ++sit) 02656 { 02657 int ss_id = *sit; 02658 Set word_descendants = getSynsetWordDescendants(ss_id); 02659 for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit) 02660 { 02661 int word_id = *wit; 02662 word_to_high_level_senses[word_id].insert(ss_id); 02663 } 02664 } 02665 Set adv_categories; 02666 getDescendantCategoriesAtLevel(ADV_SS_ID, 0, adv_depth, adv_categories); 02667 for (SetIterator sit = adv_categories.begin(); sit != adv_categories.end(); ++sit) 02668 { 02669 int ss_id = *sit; 02670 Set word_descendants = getSynsetWordDescendants(ss_id); 02671 for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit) 02672 { 02673 int word_id = *wit; 02674 word_to_high_level_senses[word_id].insert(ss_id); 02675 } 02676 } 02677 Set unk_categories; 02678 getDescendantCategoriesAtLevel(SUPER_UNKNOWN_SS_ID, 0, unk_depth, unk_categories); 02679 for (SetIterator sit = unk_categories.begin(); sit != unk_categories.end(); ++sit) 02680 { 02681 int ss_id = *sit; 02682 Set word_descendants = getSynsetWordDescendants(ss_id); 02683 for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit) 02684 { 02685 int word_id = *wit; 02686 word_to_high_level_senses[word_id].insert(ss_id); 02687 } 02688 } 02689 02690 // This role is deprecated: integrity verification : to each word should be assigned at least 1 high-level sense 02691 // The new role is now to assign "normal" senses to word that didn't get high-level senses 02692 for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it) 02693 { 02694 int word_id = it->first; 02695 if (word_to_high_level_senses[word_id].size() == 0) 02696 word_to_high_level_senses[word_id] = word_to_senses[word_id]; 02697 // This is deprecated: PLWARNING("word '%s' (%d) has no high-level sense", words[word_id].c_str(), word_id); 02698 } 02699 02700 are_word_high_level_senses_extracted = true; 02701 } 02702 02703 void WordNetOntology::extractWordNounAndVerbHighLevelSenses(int noun_depth, int verb_depth) 02704 { 02705 for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it) 02706 { 02707 int word_id = it->first; 02708 word_to_high_level_senses[word_id] = word_to_adv_senses[word_id]; 02709 word_to_high_level_senses[word_id].merge(word_to_adj_senses[word_id]); 02710 } 02711 02712 Set noun_categories; 02713 getDescendantCategoriesAtLevel(NOUN_SS_ID, 0, noun_depth, noun_categories); 02714 for (SetIterator sit = noun_categories.begin(); sit != noun_categories.end(); ++sit) 02715 { 02716 int ss_id = *sit; 02717 Set word_descendants = getSynsetWordDescendants(ss_id); 02718 for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit) 02719 { 02720 int word_id = *wit; 02721 word_to_high_level_senses[word_id].insert(ss_id); 02722 } 02723 } 02724 Set verb_categories; 02725 getDescendantCategoriesAtLevel(VERB_SS_ID, 0, verb_depth, verb_categories); 02726 for (SetIterator sit = verb_categories.begin(); sit != verb_categories.end(); ++sit) 02727 { 02728 int ss_id = *sit; 02729 Set word_descendants = getSynsetWordDescendants(ss_id); 02730 for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit) 02731 { 02732 int word_id = *wit; 02733 word_to_high_level_senses[word_id].insert(ss_id); 02734 } 02735 } 02736 02737 // BIG HACK!!! 
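// The loop below merges in the senses that getDescendantCategoriesAtLevel() recorded in
// word_to_under_target_level_high_level_senses (synsets that are themselves senses but lie
// at a depth shallower than the requested target level), so the words attached to them
// still end up with at least one high-level sense.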
02738 for (map<int, Set>::iterator it = word_to_under_target_level_high_level_senses.begin(); it != word_to_under_target_level_high_level_senses.end(); ++it) 02739 { 02740 word_to_high_level_senses[it->first].merge(it->second); 02741 } 02742 02743 for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it) 02744 { 02745 int word_id = it->first; 02746 if (word_to_high_level_senses[word_id].size() == 0) 02747 word_to_high_level_senses[word_id] = word_to_senses[word_id]; 02748 } 02749 02750 are_word_high_level_senses_extracted = true; 02751 } 02752 02753 int WordNetOntology::getWordSenseUniqueId(int word, int sense) 02754 { 02755 if (!are_word_sense_unique_ids_computed) 02756 computeWordSenseUniqueIds(); 02757 pair<int, int> ws(word, sense); 02758 if (word_sense_to_unique_id.find(ws) == word_sense_to_unique_id.end()) 02759 return -1; 02760 return word_sense_to_unique_id[ws]; 02761 } 02762 02763 void WordNetOntology::computeWordSenseUniqueIds() 02764 { 02765 int unique_id = 0; 02766 for (map<int, Set>::iterator wit = word_to_senses.begin(); wit != word_to_senses.end(); ++wit) 02767 { 02768 int w = wit->first; 02769 Set senses = wit->second; 02770 for (SetIterator sit = senses.begin(); sit != senses.end(); ++sit) 02771 { 02772 int s = *sit; 02773 pair<int, int> ws(w, s); 02774 if (word_sense_to_unique_id.find(ws) != word_sense_to_unique_id.end()) 02775 PLERROR("in computeWordSenseUniqueIds(): dupe word/sense keys (w = %d, s = %d)", w, s); 02776 word_sense_to_unique_id[ws] = unique_id++; 02777 } 02778 } 02779 are_word_sense_unique_ids_computed = true; 02780 } 02781 02782 int WordNetOntology::getWordSenseUniqueIdSize() 02783 { 02784 if (!are_word_sense_unique_ids_computed) 02785 computeWordSenseUniqueIds(); 02786 return (int)word_sense_to_unique_id.size(); 02787 } 02788 02789 // {non-letters}word{non-letters} -> word 02790 string trimWord(string word) 02791 { 02792 // trim forward 02793 int index = 0; 02794 bool forward_trimmed = isLetter(word[index]) || isDigit(word[index]) || isLegalPunct(word[index]); 02795 while (!forward_trimmed) 02796 { 02797 index++; 02798 if (index > (int)word.size()) return NULL_TAG; 02799 forward_trimmed = isLetter(word[index]) || isDigit(word[index]) || isLegalPunct(word[index]); 02800 } 02801 02802 word = word.substr(index, word.size()); 02803 02804 // trim backward 02805 index = word.size() - 1; 02806 bool backward_trimmed = isLetter(word[index]) || isDigit(word[index]) || isLegalPunct(word[index]); 02807 while (!backward_trimmed) 02808 { 02809 index--; 02810 if (index < 0) return NULL_TAG; 02811 backward_trimmed = isLetter(word[index]) || isDigit(word[index]) || isLegalPunct(word[index]); 02812 } 02813 02814 string trimmed_word = word.substr(0, index + 1); 02815 02816 if (trimmed_word == ".") 02817 return NULL_TAG; 02818 else 02819 return trimmed_word; 02820 } 02821 02822 bool isLetter(char c) 02823 { 02824 return (c >= 65 && c <= 90) || (c >= 97 && c <= 122); 02825 } 02826 02827 bool isDigit(char c) 02828 { 02829 return (c >= 48 && c <= 57); 02830 } 02831 02832 bool isAlpha(char c) 02833 { 02834 return (isLetter(c) || isDigit(c)); 02835 } 02836 02837 bool isLegalPunct(char c) 02838 { 02839 return (c == '.' 
|| c == '_'); 02840 } 02841 02842 string stemWord(string& word) 02843 { 02844 //char* input_word = const_cast<char*>(word.c_str()); 02845 char* input_word = cstr(word); 02846 char* lemma = morphword(input_word, NOUN); 02847 if (lemma == NULL) 02848 { 02849 lemma = morphword(input_word, VERB); 02850 if (lemma == NULL) 02851 { 02852 lemma = morphword(input_word, ADJ); 02853 if (lemma == NULL) 02854 { 02855 lemma = morphword(input_word, ADV); 02856 } 02857 } 02858 } 02859 if (lemma == NULL) 02860 { 02861 return word; 02862 } else 02863 { 02864 //cout << word << " -> " << lemma << endl; 02865 return string(lemma); 02866 } 02867 } 02868 02869 string stemWord(string& word, int wn_pos) 02870 { 02871 //char* input_word = const_cast<char*>(word.c_str()); 02872 char* input_word = cstr(word); 02873 char* lemma = morphword(input_word, wn_pos); 02874 if (lemma == NULL) 02875 return word; 02876 else 02877 return string(lemma); 02878 } 02879 02880 char* cstr(string& str) 02881 { 02882 char* cstr = new char[str.size() + 1]; 02883 for (unsigned int i = 0; i < str.size(); i++) 02884 *(cstr + i) = str[i]; 02885 cstr[str.size()] = '\0'; 02886 return cstr; 02887 } 02888 02889 void removeDelimiters(string& s, string delim, string replace) 02890 { 02891 unsigned int pos = s.find(delim, 0); 02892 while (pos != string::npos) 02893 { 02894 s.replace(pos, 1, replace); 02895 pos = s.find(delim, pos + 1); 02896 } 02897 } 02898 02899 bool startsWith(string& base, string s) 02900 { 02901 if (base.size() < s.size()) return false; 02902 for (unsigned int i = 0; i < s.size(); i++) 02903 if (base[i] != s[i]) return false; 02904 return true; 02905 } 02906 02907 void replaceChars(string& str, string char_to_replace, string replacing_char) 02908 { 02909 unsigned int pos = str.find(char_to_replace, 0); 02910 while (pos != string::npos) 02911 { 02912 str.replace(pos, 1, replacing_char); 02913 pos = str.find(char_to_replace, pos + 1); 02914 } 02915 } 02916 02917 } // namespace PLearn
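Illustrative usage sketch (an assumption-laden example, not part of the library source): the fragment below shows how the query methods defined in this file might typically be called. The function name exampleWordLookup, the probe word "dog", and the already-loaded instance onto are assumptions made only for this example.

    // Hypothetical caller; assumes `onto` was built beforehand from a vocabulary file.
    void exampleWordLookup(PLearn::WordNetOntology& onto)
    {
        int id = onto.getWordId("dog");                           // falls back to the OOV entry when the word is absent
        if (onto.isWord(id) && onto.isInWordNet(id))
        {
            PLearn::Set senses = onto.getWordSenses(id);          // all senses attached to the word
            PLearn::Set ancestors = onto.getWordAncestors(id, 3); // ancestor synsets up to 3 levels away
            onto.printWordOntology(id);                           // prints the word, its senses and their ontology
        }
    }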
