00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
#ifndef WORD_NET_ONTOLOGY_H
#define WORD_NET_ONTOLOGY_H

// WordNet C library API (SynsetPtr, etc.)
#include "wn.h"
#include <plearn/base/general.h>
#include <plearn_learners/language/Bitext/ShellProgressBar.h>
#include "Set.h"
// ---- Part-of-speech type codes ----
#define NOUN_TYPE 1
#define VERB_TYPE 2
#define ADJ_TYPE 3
#define ADV_TYPE 4
#define ADJ_SATELLITE_TYPE 5
#define ALL_WN_TYPE 6
#define UNDEFINED_TYPE 7
#define NUMERIC_TYPE 8
#define PROPER_NOUN_TYPE 9

#define SYNSETTAG_ID -2

// ---- Reserved synset ids (base synsets created before extraction) ----
#define UNDEFINED_SS_ID -1
#define ROOT_SS_ID 0
#define SUPER_UNKNOWN_SS_ID 1 // 'unknown' means "out of WordNet"
#define NOUN_SS_ID 2
#define VERB_SS_ID 3
#define ADJ_SS_ID 4
#define ADV_SS_ID 5
#define OOV_SS_ID 6 // out-of-vocabulary
#define PROPER_NOUN_SS_ID 7
#define NUMERIC_SS_ID 8
#define PUNCTUATION_SS_ID 9
#define STOP_SS_ID 10
#define BOS_SS_ID 11
#define EOS_SS_ID 12

// ---- Reserved (negative) WordNet file offsets for the base synsets ----
#define UNDEFINED_OFFSET -1
#define ROOT_OFFSET -2
#define SUPER_UNKNOWN_OFFSET -3 // 'unknown' means "out of WordNet"
#define NOUN_OFFSET -4
#define VERB_OFFSET -5
#define ADJ_OFFSET -6
#define ADV_OFFSET -7
#define OOV_OFFSET -8 // out-of-vocabulary
#define PROPER_NOUN_OFFSET -9
#define NUMERIC_OFFSET -10
#define PUNCTUATION_OFFSET -11
#define STOP_OFFSET -12
#define BOS_OFFSET -13
#define EOS_OFFSET -14

#define SUPER_FNUM -1

// ---- Special token tags ----
#define NULL_TAG "<null>"

#define OOV_TAG "<oov>"
#define PROPER_NOUN_TAG "<proper_noun>"
#define NUMERIC_TAG "<numeric>"
#define PUNCTUATION_TAG "<punctuation>"
#define STOP_TAG "<stop>"
#define BOS_TAG "<s>"
#define EOS_TAG "</s>"

#define VERB_TAG "<verb>"
#define NOUN_TAG "<noun>"
#define ADJ_TAG "<adj>"
#define ADV_TAG "<adv>"
#define UNDEFINED_TAG "<undefined>"

#define WNO_ERROR -1000

#define WORD_COVERAGE_THRESHOLD 10
namespace PLearn {
00129
00130
00131
string trimWord(
string word);
00132
string stemWord(
string& word);
00133
string stemWord(
string& word,
int wn_pos);
00134
bool isLetter(
char c);
00135
bool isDigit(
char c);
00136
bool isAlpha(
char c);
00137
bool isLegalPunct(
char c);
00138
char*
cstr(
string& s);
00139
void removeDelimiters(
string& s,
string delim,
string replace);
00140
bool startsWith(
string& base,
string s);
00141
void replaceChars(
string& str,
string char_to_replace,
string replacing_char);
00142
00143
00144
00145 struct Node
00146 {
00147 Node() {
ss_id =
UNDEFINED_SS_ID;
is_unknown =
true;
visited =
false;
fnum =
SUPER_FNUM;
hereiam = 0;}
00148 Node(
int id) {
ss_id =
id;
is_unknown =
true;
visited =
false;
fnum =
SUPER_FNUM;
hereiam = 0;}
00149 int ss_id;
00150 Set types;
00151 string gloss;
00152 vector<string> syns;
00153 Set parents;
00154 Set children;
00155 bool is_unknown;
00156
00157 bool visited;
00158 long hereiam;
00159 int fnum;
00160 };
00161
00162 class WordNetOntology
00163 {
00164
00165
protected:
00166
00167
00168 map<int, Set>
word_to_senses;
00169 map<int, Set>
word_to_noun_senses;
00170 map<int, Set>
word_to_verb_senses;
00171 map<int, Set>
word_to_adj_senses;
00172 map<int, Set>
word_to_adv_senses;
00173 map<int, Set>
sense_to_words;
00174 map<int, Set>
synset_to_ancestors;
00175 map<int, Set>
word_to_ancestors;
00176 map<int, Set>
synset_to_sense_descendants;
00177 map<int, Set>
synset_to_word_descendants;
00178 map<int, Node*>
synsets;
00179 map<int, string>
words;
00180 map<string, int>
words_id;
00181 map<int, vector<int> >
word_to_noun_wnsn;
00182 map<int, vector<int> >
word_to_verb_wnsn;
00183 map<int, vector<int> >
word_to_adj_wnsn;
00184 map<int, vector<int> >
word_to_adv_wnsn;
00185 map<int, int>
word_to_predominent_pos;
00186 map<int, bool>
word_is_in_wn;
00187 map<int, Set>
word_to_high_level_senses;
00188 map<pair<int, int>,
int>
word_sense_to_unique_id;
00189 map<int, Set>
word_to_under_target_level_high_level_senses;
00190 map< pair<int, string>,
int>
sense_key_to_ss_id;
00191 map<pair<int,int>,
string>
ws_id_to_sense_key;
00192
00193 int word_index;
00194 int synset_index;
00195 int unknown_sense_index;
00196
00197
00198 int noun_count;
00199 int verb_count;
00200 int adj_count ;
00201 int adv_count;
00202
00203 int noun_sense_count;
00204 int verb_sense_count;
00205 int adj_sense_count;
00206 int adv_sense_count;
00207
00208 int in_wn_word_count;
00209 int out_of_wn_word_count;
00210
00211
00212 bool are_ancestors_extracted;
00213 bool are_descendants_extracted;
00214 bool are_predominent_pos_extracted;
00215 bool are_word_high_level_senses_extracted;
00216 bool are_word_sense_unique_ids_computed;
00217
00218 int n_word_high_level_senses;
00219
00220
00221
00222
00223
00224
00225 bool differentiate_unknown_words;
00226
00227
public:
00228
00229
WordNetOntology();
00230
00231
WordNetOntology(
string voc_file,
00232
bool differentiate_unknown_words,
00233
bool pre_compute_ancestors,
00234
bool pre_compute_descendants,
00235
int wn_pos_type = ALL_WN_TYPE,
00236
int word_coverage_threshold = -1);
00237
00238
WordNetOntology(
string voc_file,
00239
string synset_file,
00240
string ontology_file,
00241
bool pre_compute_ancestors,
00242
bool pre_compute_descendants,
00243
int word_coverage_threshold = -1);
00244
00245
WordNetOntology(
string voc_file,
00246
string synset_file,
00247
string ontology_file,
00248
string sense_key_file,
00249
bool pre_compute_ancestors,
00250
bool pre_compute_descendants,
00251
int word_coverage_threshold = -1);
00252
00253
void save(
string synset_file,
string ontology_file);
00254
void save(
string voc_file);
00255
void saveVocInWordnet(
string voc_file);
00256
void save(
string synset_file,
string ontology_file,
string sense_key_file);
00257
void load(
string voc_file,
string synset_file,
string ontology_file);
00258
void load(
string voc_file,
string synset_file,
string ontology_file,
string sense_key_file);
00259
void savePredominentSyntacticClasses(
string file);
00260
void loadPredominentSyntacticClasses(
string file);
00261
00262
00263
string getSenseKey(
int word_id,
int ss_id);
00264
int getSynsetIDForSenseKey(
int word_id,
string sense_key);
00265
int getWordId(
string word);
00266
string getWord(
int id);
00267
int getWordSenseIdForWnsn(
string word,
int wn_pos_type,
int wnsn);
00268
int getWordSenseIdForSenseKey(
string lemma,
string lexsn,
string word);
00269
int getWordSenseUniqueId(
int word,
int sense);
00270
int getWordSenseUniqueIdSize();
00271
Set getWordSenses(
int id);
00272
Set getWordHighLevelSenses(
int id);
00273
Set getWordNounSenses(
int id);
00274
Set getWordVerbSenses(
int id);
00275
Set getWordAdjSenses(
int id);
00276
Set getWordAdvSenses(
int id);
00277
Set getWordsForSense(
int id);
00278
Set getSynsetAncestors(
int id,
int max_level = -1);
00279
Set getSynsetParents(
int id);
00280
Set getWordAncestors(
int id,
int max_level = -1);
00281
Set getSynsetSenseDescendants(
int id);
00282
Set getSynsetWordDescendants(
int id);
00283
Node*
getSynset(
int id);
00284 Node*
getRootSynset() {
return synsets[
ROOT_SS_ID]; }
00285
Set getAllWords();
00286
Set getAllSenses();
00287
Set getAllCategories();
00288 int getVocSize() {
return words.size(); }
00289 int getSenseSize() {
return sense_to_words.size(); }
00290 int getSynsetSize() {
return synsets.size(); }
00291
int getMaxSynsetId();
00292
Set getSyntacticClassesForWord(
int word_id);
00293
int getSyntacticClassForSense(
int sense_id);
00294
int getPredominentSyntacticClassForWord(
int word_id);
00295
void getDescendantCategoriesAtLevel(
int ss_id,
int cur_level,
int target_level,
Set categories);
00296
void getDownToUpParentCategoriesAtLevel(
int ss_id,
int target_level,
Set categories,
int cur_level = 0);
00297
00298
00299
bool isWord(
int id);
00300
bool isWord(
string word);
00301
bool isSense(
int id);
00302
bool isPureSense(
int id);
00303
bool isCategory(
int id);
00304
bool isPureCategory(
int id);
00305
bool isSynset(
int id);
00306
bool isWordUnknown(
string word);
00307
bool isWordUnknown(
int id);
00308
bool isSynsetUnknown(
int id);
00309
bool isInWordNet(
string word,
bool trim_word =
true,
bool stem_word =
true,
bool remove_undescores =
false);
00310
bool isInWordNet(
int word_id);
00311
bool hasSenseInWordNet(
string word,
int wn_pos_type);
00312
bool isTopLevelCategory(
int ss_id);
00313 bool containsWord(
string word) {
return (
words_id.find(word) !=
words_id.end()); }
00314 bool containsWordId(
int id) {
return (
words.find(
id) !=
words.end()); }
00315
00316
Node *findSynsetFromSynsAndGloss(
const vector<string> &syns,
const string &gloss,
const long offset,
const int fnum);
00317
void removeNonReachableSynsets();
00318
void removeWord(
int id);
00319
00320
void print(
bool print_ontology =
true);
00321
void printSynset(
int ss_id,
int indent_level = 0);
00322
void printSynset(
int ss_id, ostream& sout,
int indent_level = 0);
00323
void printStats();
00324
void printSynsetAncestors();
00325
void printWordAncestors();
00326
void printDescendants();
00327
void printNodes();
00328
void printWordOntology(
int id);
00329
void printWordOntology(
string word);
00330
void printInvertedSynsetOntology(
int id,
int level = 0);
00331
00332
int overlappingSynsets(
int ss_id1,
int ss_id2);
00333 bool areOverlappingSynsets(
int ss_id1,
int ss_id2) {
return (overlappingSynsets(ss_id1, ss_id2) > 1); }
00334
void intersectAncestorsAndSenses(
Set categories,
Set senses);
00335
void reducePolysemy(
int level);
00336
void extractPredominentSyntacticClasses();
00337
void extractWordHighLevelSenses(
int noun_depth,
int verb_depth,
int adj_depth,
int adv_depth,
int unk_depth);
00338
void extractWordNounAndVerbHighLevelSenses(
int noun_depth,
int verb_depth);
00339
00340
00341
void detectWordsWithoutOntology();
00342
void lookForSpecialTags();
00343
00344
void extract(
string voc_file,
int wn_pos_type);
00345
void extractWord(
string original_word,
int wn_pos_type,
bool trim_word,
bool stem_word,
bool remove_underscores);
00346
bool extractSenses(
string original_word,
string processed_word,
int wn_pos_type);
00347
void extractTaggedWordFrequencies(map<
int, map<int, int> > &word_senses_to_tagged_frequencies);
00348
00349
Node* extractOntology(SynsetPtr ssp);
00350
void extractAncestors(
int threshold,
bool cut_with_word_coverage,
bool exclude_itself);
00351
void extractAncestors(
Node* node,
Set ancestors,
int level,
int level_threshold);
00352
void extractAncestors(
Node* node,
Set ancestors,
int word_coverage_threshold);
00353
void extractDescendants(
Node* node,
Set sense_descendants,
Set word_descendants);
00354
void extractStrictDescendants(
Node* node,
Set sense_descendants,
Set word_descendants);
00355
void extractDescendants();
00356
void computeWordSenseUniqueIds();
00357
void init(
bool differentiate_unknown_words =
true);
00358
void createBaseSynsets();
00359
void processUnknownWord(
int word_id);
00360
void finalize();
00361
void propagatePOSTypes();
00362
void propagatePOSTypes(
Node* node);
00363
void linkUpperCategories();
00364
00365
00366
Node* checkForAlreadyExtractedSynset(SynsetPtr ssp);
00367
vector<string> getSynsetWords(SynsetPtr ssp);
00368
bool catchSpecialTags(
string word);
00369
void reduceWordPolysemy(
int word_id,
int level);
00370
void reduceWordPolysemy_preserveSenseOverlapping(
int word_id,
int level);
00371
void reduceWordPolysemy_preserveSenseOverlapping2(
int word_id,
int level);
00372
00373
void getCategoriesAtLevel(
int ss_id,
int cur_level,
int target_level,
set<int>& categories);
00374
void getCategoriesUnderLevel(
int ss_id,
int cur_level,
int target_level,
Set categories);
00375
void visitUpward(
Node* node);
00376
void unvisitDownward(
Node *node);
00377
void unvisitAll();
00378
void printOntology(
Node* node,
int level = 0);
00379
00380
00381
00382
00383
00384
00385 map<int, TVec<int> >
temp_word_to_senses;
00386 map<int, TVec<int> >
temp_word_to_noun_senses;
00387 map<int, TVec<int> >
temp_word_to_verb_senses;
00388 map<int, TVec<int> >
temp_word_to_adj_senses;
00389 map<int, TVec<int> >
temp_word_to_adv_senses;
00390 map<int, TVec<int> >
temp_word_to_high_level_senses;
00391
00392 void fillTempWordToSensesTVecMap()
00393 {
00394
for (map<int, Set>::iterator it =
word_to_senses.begin(); it !=
word_to_senses.end(); ++it)
00395 {
00396
int w = it->first;
00397
Set senses = it->second;
00398
for (
SetIterator sit = senses.
begin(); sit != senses.
end(); ++sit)
00399
temp_word_to_senses[w].push_back(*sit);
00400 }
00401
00402
for (map<int, Set>::iterator it =
word_to_noun_senses.begin(); it !=
word_to_noun_senses.end(); ++it)
00403 {
00404
int w = it->first;
00405
Set senses = it->second;
00406
for (
SetIterator sit = senses.
begin(); sit != senses.
end(); ++sit)
00407
temp_word_to_noun_senses[w].push_back(*sit);
00408 }
00409
00410
for (map<int, Set>::iterator it =
word_to_verb_senses.begin(); it !=
word_to_verb_senses.end(); ++it)
00411 {
00412
int w = it->first;
00413
Set senses = it->second;
00414
for (
SetIterator sit = senses.
begin(); sit != senses.
end(); ++sit)
00415
temp_word_to_verb_senses[w].push_back(*sit);
00416 }
00417
00418
for (map<int, Set>::iterator it =
word_to_adj_senses.begin(); it !=
word_to_adj_senses.end(); ++it)
00419 {
00420
int w = it->first;
00421
Set senses = it->second;
00422
for (
SetIterator sit = senses.
begin(); sit != senses.
end(); ++sit)
00423
temp_word_to_adj_senses[w].push_back(*sit);
00424 }
00425
00426
for (map<int, Set>::iterator it =
word_to_adv_senses.begin(); it !=
word_to_adv_senses.end(); ++it)
00427 {
00428
int w = it->first;
00429
Set senses = it->second;
00430
for (
SetIterator sit = senses.
begin(); sit != senses.
end(); ++sit)
00431
temp_word_to_adv_senses[w].push_back(*sit);
00432 }
00433 }
00434
00435 TVec<int> getSensesForWord(
int w) {
return temp_word_to_senses[w]; }
00436
00437 void fillTempWordToHighLevelSensesTVecMap()
00438 {
00439
for (map<int, string>::iterator it =
words.begin(); it !=
words.end(); ++it)
00440 {
00441
int w = it->first;
00442
Set hl_senses = getWordHighLevelSenses(w);
00443
for (
SetIterator sit = hl_senses.
begin(); sit != hl_senses.
end(); ++sit)
00444
temp_word_to_high_level_senses[w].push_back(*sit);
00445 }
00446 }
00447 TVec<int> getHighLevelSensesForWord(
int w) {
return temp_word_to_high_level_senses[w]; }
00448
00449 TVec<int> getSecondLevelSensesForWord(
int w)
00450 {
00451
Set sl_senses;
00452
Set senses =
word_to_senses[w];
00453
for (
SetIterator sit = senses.
begin(); sit != senses.
end(); ++sit)
00454 {
00455
int s = *sit;
00456
Node* node =
synsets[s];
00457
for (
SetIterator ssit = node->
parents.
begin(); ssit != node->
parents.
end(); ++ssit)
00458 {
00459 sl_senses.
insert(*ssit);
00460 }
00461 }
00462
TVec<int> sl_senses_vec;
00463
for (
SetIterator slit = sl_senses.
begin(); slit != sl_senses.
end(); ++slit)
00464 sl_senses_vec.
push_back(*slit);
00465
return sl_senses_vec;
00466 }
00467
00468 TVec<int> getThirdLevelSensesForWord(
int w)
00469 {
00470
Set tl_senses;
00471
Set senses =
word_to_senses[w];
00472
for (
SetIterator sit = senses.
begin(); sit != senses.
end(); ++sit)
00473 {
00474
int s = *sit;
00475
Node* node =
synsets[s];
00476
for (
SetIterator slit = node->
parents.
begin(); slit != node->
parents.
end(); ++slit)
00477 {
00478
int sl_sense = *slit;
00479
Node* node = synsets[sl_sense];
00480
for (
SetIterator tlit = node->
parents.
begin(); tlit != node->
parents.
end(); ++tlit)
00481 {
00482 tl_senses.
insert(*tlit);
00483 }
00484 }
00485 }
00486
TVec<int> tl_senses_vec;
00487
for (
SetIterator tlit = tl_senses.
begin(); tlit != tl_senses.
end(); ++tlit)
00488 tl_senses_vec.
push_back(*tlit);
00489
return tl_senses_vec;
00490 }
00491
00492 };
} // end of namespace PLearn

#endif // WORD_NET_ONTOLOGY_H