PLearn: WordNetOntology.h Source File

00001 // -*- C++ -*- 00002 00003 // PLearn (A C++ Machine Learning Library) 00004 // Copyright (C) 1998 Pascal Vincent 00005 // Copyright (C) 1999,2000 Pascal Vincent, Yoshua Bengio and University of Montreal 00006 // 00007 // This file is part of the PLearn Library. This library is free 00008 // software; you can redistribute it and/or modify it under the 00009 // terms of the GNU General Public License as published by the 00010 // Free Software Foundation, version 2. 00011 // 00012 // This library is distributed in the hope that it will be useful, 00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 // GNU General Public License for more details. 00016 // 00017 // You should have received a copy of the GNU General Public License 00018 // along with this library; see the file GPL.txt If not, write to the Free 00019 // Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 00020 // 00021 // As a special exception, you may compile and link this library with files 00022 // not covered by the GNU General Public License, and distribute the resulting 00023 // executable file under the terms of your choice, without the requirement to 00024 // distribute the complete corresponding source code, provided you have 00025 // obtained explicit written permission to do so from Pascal Vincent (primary 00026 // author of the library) or Yoshua Bengio or the University of Montreal. 00027 // This exception does not however invalidate any other reasons why the 00028 // executable file might be covered by the GNU General Public License. 00029 // 00030 // See the following URL for more information on PLearn: 00031 // http://plearn.sourceforge.net 00032 00033 00034 00035 /* ******************************************************* 00036 * $Id: WordNetOntology.h,v 1.23 2004/07/21 16:30:57 chrish42 Exp $ 00037 * AUTHORS: Christian Jauvin 00038 * This file is part of the PLearn library. 00039 ******************************************************* */ 00040 00041 #ifndef WORD_NET_ONTOLOGY_H 00042 #define WORD_NET_ONTOLOGY_H 00043 00044 #include "wn.h" 00045 #include <plearn/base/general.h> 00046 #include <plearn_learners/language/Bitext/ShellProgressBar.h> 00047 #include "Set.h" 00048 00049 // #define NOUN_TYPE 1000 00050 // #define VERB_TYPE 1001 00051 // #define ADJ_TYPE 1002 00052 // #define ADV_TYPE 1003 00053 // #define UNDEFINED_TYPE 1004 00054 // #define ALL_WN_TYPE 1005 00055 // #define NUMERIC_TYPE 1006 00056 // #define PROPER_NOUN_TYPE 1007 00057 00058 #define NOUN_TYPE 1 00059 #define VERB_TYPE 2 00060 #define ADJ_TYPE 3 00061 #define ADV_TYPE 4 00062 #define ADJ_SATELLITE_TYPE 5 00063 #define ALL_WN_TYPE 6 00064 #define UNDEFINED_TYPE 7 00065 #define NUMERIC_TYPE 8 00066 #define PROPER_NOUN_TYPE 9 00067 00068 #define SYNSETTAG_ID -2 00069 00070 #define UNDEFINED_SS_ID -1 00071 #define ROOT_SS_ID 0 00072 #define SUPER_UNKNOWN_SS_ID 1 // 'unknown' means "out of WordNet" 00073 #define NOUN_SS_ID 2 00074 #define VERB_SS_ID 3 00075 #define ADJ_SS_ID 4 00076 #define ADV_SS_ID 5 00077 #define OOV_SS_ID 6 // out-of-vocabulary 00078 #define PROPER_NOUN_SS_ID 7 00079 #define NUMERIC_SS_ID 8 00080 #define PUNCTUATION_SS_ID 9 00081 #define STOP_SS_ID 10 00082 #define BOS_SS_ID 11 00083 #define EOS_SS_ID 12 00084 00085 #define UNDEFINED_OFFSET -1 00086 #define ROOT_OFFSET -2 00087 #define SUPER_UNKNOWN_OFFSET -3 // 'unknown' means "out of WordNet" 00088 #define NOUN_OFFSET -4 00089 #define VERB_OFFSET -5 00090 #define ADJ_OFFSET -6 00091 #define ADV_OFFSET -7 00092 #define OOV_OFFSET -8 // out-of-vocabulary 00093 #define PROPER_NOUN_OFFSET -9 00094 #define NUMERIC_OFFSET -10 00095 #define PUNCTUATION_OFFSET -11 00096 #define STOP_OFFSET -12 00097 #define BOS_OFFSET -13 00098 #define EOS_OFFSET -14 00099 00100 #define SUPER_FNUM -1 00101 00102 #define NULL_TAG "<null>" 00103 00104 #define OOV_TAG "<oov>" 00105 #define PROPER_NOUN_TAG "<proper_noun>" 00106 #define NUMERIC_TAG "<numeric>" 00107 #define PUNCTUATION_TAG "<punctuation>" 00108 #define STOP_TAG "<stop>" 00109 #define BOS_TAG "<s>" 00110 #define EOS_TAG "</s>" 00111 00112 #define VERB_TAG "<verb>" 00113 #define NOUN_TAG "<noun>" 00114 #define ADJ_TAG "<adj>" 00115 #define ADV_TAG "<adv>" 00116 #define UNDEFINED_TAG "<undefined>" 00117 00118 #define WNO_ERROR -1000 00119 00120 #define WORD_COVERAGE_THRESHOLD 10 00121 00122 // terminology : 00123 // word 00124 // sense : word meaning 00125 // category : concepts (forming an ontology DAG) 00126 // synset : sense U category 00127 // 00128 namespace PLearn { 00129 00130 // utils 00131 string trimWord(string word); 00132 string stemWord(string& word); // call to WN morphword() 00133 string stemWord(string& word, int wn_pos); 00134 bool isLetter(char c); 00135 bool isDigit(char c); 00136 bool isAlpha(char c); 00137 bool isLegalPunct(char c); 00138 char* cstr(string& s); 00139 void removeDelimiters(string& s, string delim, string replace); 00140 bool startsWith(string& base, string s); 00141 void replaceChars(string& str, string char_to_replace, string replacing_char); 00142 00143 // ontology DAG node 00144 00145 struct Node 00146 { 00147 Node() { ss_id = UNDEFINED_SS_ID; is_unknown = true; visited = false; fnum = SUPER_FNUM; hereiam = 0;} 00148 Node(int id) { ss_id = id; is_unknown = true; visited = false; fnum = SUPER_FNUM; hereiam = 0;} 00149 int ss_id; 00150 Set types; 00151 string gloss; 00152 vector<string> syns; 00153 Set parents; 00154 Set children; 00155 bool is_unknown; 00156 //int level; 00157 bool visited; 00158 long hereiam; 00159 int fnum; 00160 }; 00161 00162 class WordNetOntology 00163 { 00164 00165 protected: 00166 00167 // main ontology structure access points 00168 map<int, Set> word_to_senses; 00169 map<int, Set> word_to_noun_senses; 00170 map<int, Set> word_to_verb_senses; 00171 map<int, Set> word_to_adj_senses; 00172 map<int, Set> word_to_adv_senses; 00173 map<int, Set> sense_to_words; 00174 map<int, Set> synset_to_ancestors; 00175 map<int, Set> word_to_ancestors; 00176 map<int, Set> synset_to_sense_descendants; 00177 map<int, Set> synset_to_word_descendants; 00178 map<int, Node*> synsets; 00179 map<int, string> words; 00180 map<string, int> words_id; 00181 map<int, vector<int> > word_to_noun_wnsn; 00182 map<int, vector<int> > word_to_verb_wnsn; 00183 map<int, vector<int> > word_to_adj_wnsn; 00184 map<int, vector<int> > word_to_adv_wnsn; 00185 map<int, int> word_to_predominent_pos; 00186 map<int, bool> word_is_in_wn; 00187 map<int, Set> word_to_high_level_senses; 00188 map<pair<int, int>, int> word_sense_to_unique_id; 00189 map<int, Set> word_to_under_target_level_high_level_senses; // BIG HACK!!! 00190 map< pair<int, string>,int> sense_key_to_ss_id; 00191 map<pair<int,int>, string> ws_id_to_sense_key; 00192 00193 int word_index; // unique id for words 00194 int synset_index; // unique id for synsets 00195 int unknown_sense_index; // unique id for unknown senses 00196 00197 // stats 00198 int noun_count; 00199 int verb_count; 00200 int adj_count ; 00201 int adv_count; 00202 00203 int noun_sense_count; 00204 int verb_sense_count; 00205 int adj_sense_count; 00206 int adv_sense_count; 00207 00208 int in_wn_word_count; 00209 int out_of_wn_word_count; 00210 00211 // these flags are set to 'true' when the corresponding data is pre-computed 00212 bool are_ancestors_extracted; 00213 bool are_descendants_extracted; 00214 bool are_predominent_pos_extracted; 00215 bool are_word_high_level_senses_extracted; 00216 bool are_word_sense_unique_ids_computed; 00217 00218 int n_word_high_level_senses; 00219 00220 // If 'differentiate_unknown_words' is set to 'true', all the unknown words (words that are 00221 // out of WordNet) will be mapped to DIFFERENT synsets (senses), that are all going to be linked 00222 // to a SUPER-UNKNOWN synset node (a direct child of the ROOT synset). If it is set to 'false', 00223 // all the unknown words will be mapped to a single synset, SUPER-UNKNOWN, acting in this 00224 // context as a SENSE, and a direct child of the ROOT synset. 00225 bool differentiate_unknown_words; 00226 00227 public: 00228 00229 WordNetOntology(); // simply init the system, and load an ontology later 00230 00231 WordNetOntology(string voc_file, // build a new ontology, given a voc file 00232 bool differentiate_unknown_words, 00233 bool pre_compute_ancestors, 00234 bool pre_compute_descendants, 00235 int wn_pos_type = ALL_WN_TYPE, 00236 int word_coverage_threshold = -1); 00237 00238 WordNetOntology(string voc_file, // init the system and load an ontology, 00239 string synset_file, // given a voc file, a synset file and an ontology file 00240 string ontology_file, 00241 bool pre_compute_ancestors, 00242 bool pre_compute_descendants, 00243 int word_coverage_threshold = -1); 00244 00245 WordNetOntology(string voc_file, // init the system and load an ontology, 00246 string synset_file, // given a voc file, a synset file and an ontology file 00247 string ontology_file, 00248 string sense_key_file, 00249 bool pre_compute_ancestors, 00250 bool pre_compute_descendants, 00251 int word_coverage_threshold = -1); 00252 00253 void save(string synset_file, string ontology_file); 00254 void save(string voc_file); 00255 void saveVocInWordnet(string voc_file); 00256 void save(string synset_file, string ontology_file, string sense_key_file); 00257 void load(string voc_file, string synset_file, string ontology_file); 00258 void load(string voc_file, string synset_file, string ontology_file, string sense_key_file); 00259 void savePredominentSyntacticClasses(string file); 00260 void loadPredominentSyntacticClasses(string file); 00261 00262 // main access methods 00263 string getSenseKey(int word_id, int ss_id); 00264 int getSynsetIDForSenseKey(int word_id, string sense_key); 00265 int getWordId(string word); 00266 string getWord(int id); 00267 int getWordSenseIdForWnsn(string word, int wn_pos_type, int wnsn); 00268 int getWordSenseIdForSenseKey(string lemma, string lexsn, string word); 00269 int getWordSenseUniqueId(int word, int sense); 00270 int getWordSenseUniqueIdSize(); 00271 Set getWordSenses(int id); 00272 Set getWordHighLevelSenses(int id); 00273 Set getWordNounSenses(int id); 00274 Set getWordVerbSenses(int id); 00275 Set getWordAdjSenses(int id); 00276 Set getWordAdvSenses(int id); 00277 Set getWordsForSense(int id); 00278 Set getSynsetAncestors(int id, int max_level = -1); 00279 Set getSynsetParents(int id); 00280 Set getWordAncestors(int id, int max_level = -1); 00281 Set getSynsetSenseDescendants(int id); 00282 Set getSynsetWordDescendants(int id); 00283 Node* getSynset(int id); 00284 Node* getRootSynset() { return synsets[ROOT_SS_ID]; } 00285 Set getAllWords(); 00286 Set getAllSenses(); 00287 Set getAllCategories(); 00288 int getVocSize() { return words.size(); } 00289 int getSenseSize() { return sense_to_words.size(); } 00290 int getSynsetSize() { return synsets.size(); } 00291 int getMaxSynsetId(); 00292 Set getSyntacticClassesForWord(int word_id); 00293 int getSyntacticClassForSense(int sense_id); 00294 int getPredominentSyntacticClassForWord(int word_id); 00295 void getDescendantCategoriesAtLevel(int ss_id, int cur_level, int target_level, Set categories); 00296 void getDownToUpParentCategoriesAtLevel(int ss_id, int target_level, Set categories, int cur_level = 0); 00297 00298 00299 bool isWord(int id); 00300 bool isWord(string word); 00301 bool isSense(int id); // is a correct sens id 00302 bool isPureSense(int id);// is a synset but not a category 00303 bool isCategory(int id); // = isSynset 00304 bool isPureCategory(int id);// is a synset but not a sense 00305 bool isSynset(int id); // is a synset (sense or category) 00306 bool isWordUnknown(string word); 00307 bool isWordUnknown(int id); 00308 bool isSynsetUnknown(int id); 00309 bool isInWordNet(string word, bool trim_word = true, bool stem_word = true, bool remove_undescores = false); 00310 bool isInWordNet(int word_id); 00311 bool hasSenseInWordNet(string word, int wn_pos_type); 00312 bool isTopLevelCategory(int ss_id); 00313 bool containsWord(string word) { return (words_id.find(word) != words_id.end()); } 00314 bool containsWordId(int id) { return (words.find(id) != words.end()); } 00315 00316 Node *findSynsetFromSynsAndGloss(const vector<string> &syns, const string &gloss, const long offset, const int fnum); 00317 void removeNonReachableSynsets(); 00318 void removeWord(int id); 00319 00320 void print(bool print_ontology = true); 00321 void printSynset(int ss_id, int indent_level = 0); 00322 void printSynset(int ss_id, ostream& sout, int indent_level = 0); 00323 void printStats(); 00324 void printSynsetAncestors(); 00325 void printWordAncestors(); 00326 void printDescendants(); 00327 void printNodes(); 00328 void printWordOntology(int id); 00329 void printWordOntology(string word); 00330 void printInvertedSynsetOntology(int id, int level = 0); 00331 00332 int overlappingSynsets(int ss_id1, int ss_id2); 00333 bool areOverlappingSynsets(int ss_id1, int ss_id2) { return (overlappingSynsets(ss_id1, ss_id2) > 1); } 00334 void intersectAncestorsAndSenses(Set categories, Set senses); 00335 void reducePolysemy(int level); 00336 void extractPredominentSyntacticClasses(); 00337 void extractWordHighLevelSenses(int noun_depth, int verb_depth, int adj_depth, int adv_depth, int unk_depth); 00338 void extractWordNounAndVerbHighLevelSenses(int noun_depth, int verb_depth); 00339 00340 // integrity verifications 00341 void detectWordsWithoutOntology(); 00342 void lookForSpecialTags(); 00343 00344 void extract(string voc_file, int wn_pos_type); 00345 void extractWord(string original_word, int wn_pos_type, bool trim_word, bool stem_word, bool remove_underscores); 00346 bool extractSenses(string original_word, string processed_word, int wn_pos_type); 00347 void extractTaggedWordFrequencies(map<int, map<int, int> > &word_senses_to_tagged_frequencies); 00348 //int extractFrequencies(string word, int sense, int wn_pos_type); 00349 Node* extractOntology(SynsetPtr ssp); 00350 void extractAncestors(int threshold, bool cut_with_word_coverage, bool exclude_itself); 00351 void extractAncestors(Node* node, Set ancestors, int level, int level_threshold); 00352 void extractAncestors(Node* node, Set ancestors, int word_coverage_threshold); 00353 void extractDescendants(Node* node, Set sense_descendants, Set word_descendants); 00354 void extractStrictDescendants(Node* node, Set sense_descendants, Set word_descendants); 00355 void extractDescendants(); 00356 void computeWordSenseUniqueIds(); 00357 void init(bool differentiate_unknown_words = true); 00358 void createBaseSynsets(); 00359 void processUnknownWord(int word_id); 00360 void finalize(); 00361 void propagatePOSTypes(); 00362 void propagatePOSTypes(Node* node); 00363 void linkUpperCategories(); 00364 //void setLevels(); 00365 //void setLevels(int ss_id, int level); 00366 Node* checkForAlreadyExtractedSynset(SynsetPtr ssp); 00367 vector<string> getSynsetWords(SynsetPtr ssp); 00368 bool catchSpecialTags(string word); 00369 void reduceWordPolysemy(int word_id, int level); 00370 void reduceWordPolysemy_preserveSenseOverlapping(int word_id, int level); 00371 void reduceWordPolysemy_preserveSenseOverlapping2(int word_id, int level); 00372 //void getCategoriesAtLevel(int ss_id, int level, set<int>& categories); 00373 void getCategoriesAtLevel(int ss_id, int cur_level, int target_level, set<int>& categories); 00374 void getCategoriesUnderLevel(int ss_id, int cur_level, int target_level, Set categories); 00375 void visitUpward(Node* node); 00376 void unvisitDownward(Node *node); 00377 void unvisitAll(); 00378 void printOntology(Node* node, int level = 0); 00379 00380 // ATTENTION: il y a un systeme de mapping word->senses temporaire 00381 // et base sur TVec<int>, qui sert uniquement dans le contexte de la WSD 00382 // (SemiSupervisedSparseDataNeuralNet). Le but a moyen terme est de remplacer 00383 // tous les mappings bases sur Set dans WordNetOntology pour les faire reposer 00384 // sur TVec<int>, plus PLearn-compliant. 00385 map<int, TVec<int> > temp_word_to_senses; 00386 map<int, TVec<int> > temp_word_to_noun_senses; 00387 map<int, TVec<int> > temp_word_to_verb_senses; 00388 map<int, TVec<int> > temp_word_to_adj_senses; 00389 map<int, TVec<int> > temp_word_to_adv_senses; 00390 map<int, TVec<int> > temp_word_to_high_level_senses; 00391 00392 void fillTempWordToSensesTVecMap() 00393 { 00394 for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it) 00395 { 00396 int w = it->first; 00397 Set senses = it->second; 00398 for (SetIterator sit = senses.begin(); sit != senses.end(); ++sit) 00399 temp_word_to_senses[w].push_back(*sit); 00400 } 00401 00402 for (map<int, Set>::iterator it = word_to_noun_senses.begin(); it != word_to_noun_senses.end(); ++it) 00403 { 00404 int w = it->first; 00405 Set senses = it->second; 00406 for (SetIterator sit = senses.begin(); sit != senses.end(); ++sit) 00407 temp_word_to_noun_senses[w].push_back(*sit); 00408 } 00409 00410 for (map<int, Set>::iterator it = word_to_verb_senses.begin(); it != word_to_verb_senses.end(); ++it) 00411 { 00412 int w = it->first; 00413 Set senses = it->second; 00414 for (SetIterator sit = senses.begin(); sit != senses.end(); ++sit) 00415 temp_word_to_verb_senses[w].push_back(*sit); 00416 } 00417 00418 for (map<int, Set>::iterator it = word_to_adj_senses.begin(); it != word_to_adj_senses.end(); ++it) 00419 { 00420 int w = it->first; 00421 Set senses = it->second; 00422 for (SetIterator sit = senses.begin(); sit != senses.end(); ++sit) 00423 temp_word_to_adj_senses[w].push_back(*sit); 00424 } 00425 00426 for (map<int, Set>::iterator it = word_to_adv_senses.begin(); it != word_to_adv_senses.end(); ++it) 00427 { 00428 int w = it->first; 00429 Set senses = it->second; 00430 for (SetIterator sit = senses.begin(); sit != senses.end(); ++sit) 00431 temp_word_to_adv_senses[w].push_back(*sit); 00432 } 00433 } 00434 00435 TVec<int> getSensesForWord(int w) { return temp_word_to_senses[w]; } 00436 00437 void fillTempWordToHighLevelSensesTVecMap() 00438 { 00439 for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it) 00440 { 00441 int w = it->first; 00442 Set hl_senses = getWordHighLevelSenses(w); 00443 for (SetIterator sit = hl_senses.begin(); sit != hl_senses.end(); ++sit) 00444 temp_word_to_high_level_senses[w].push_back(*sit); 00445 } 00446 } 00447 TVec<int> getHighLevelSensesForWord(int w) { return temp_word_to_high_level_senses[w]; } 00448 00449 TVec<int> getSecondLevelSensesForWord(int w) 00450 { 00451 Set sl_senses; 00452 Set senses = word_to_senses[w]; 00453 for (SetIterator sit = senses.begin(); sit != senses.end(); ++sit) 00454 { 00455 int s = *sit; 00456 Node* node = synsets[s]; 00457 for (SetIterator ssit = node->parents.begin(); ssit != node->parents.end(); ++ssit) 00458 { 00459 sl_senses.insert(*ssit); 00460 } 00461 } 00462 TVec<int> sl_senses_vec; 00463 for (SetIterator slit = sl_senses.begin(); slit != sl_senses.end(); ++slit) 00464 sl_senses_vec.push_back(*slit); 00465 return sl_senses_vec; 00466 } 00467 00468 TVec<int> getThirdLevelSensesForWord(int w) 00469 { 00470 Set tl_senses; 00471 Set senses = word_to_senses[w]; 00472 for (SetIterator sit = senses.begin(); sit != senses.end(); ++sit) 00473 { 00474 int s = *sit; 00475 Node* node = synsets[s]; 00476 for (SetIterator slit = node->parents.begin(); slit != node->parents.end(); ++slit) 00477 { 00478 int sl_sense = *slit; 00479 Node* node = synsets[sl_sense]; 00480 for (SetIterator tlit = node->parents.begin(); tlit != node->parents.end(); ++tlit) 00481 { 00482 tl_senses.insert(*tlit); 00483 } 00484 } 00485 } 00486 TVec<int> tl_senses_vec; 00487 for (SetIterator tlit = tl_senses.begin(); tlit != tl_senses.end(); ++tlit) 00488 tl_senses_vec.push_back(*tlit); 00489 return tl_senses_vec; 00490 } 00491 00492 }; 00493 00494 } 00495 00496 #endif