#include "WordNetOntology.h"
00042
#include <algo.h>
00043
#include <plearn/base/stringutils.h>
00044

namespace PLearn {

using namespace std;

#define NOWARNING

WordNetOntology::WordNetOntology()
{
    init();
    createBaseSynsets();
}

WordNetOntology::WordNetOntology(string voc_file,
                                 bool differentiate_unknown_words,
                                 bool pre_compute_ancestors,
                                 bool pre_compute_descendants,
                                 int wn_pos_type,
                                 int word_coverage_threshold)
{
    init(differentiate_unknown_words);
    createBaseSynsets();
    extract(voc_file, wn_pos_type);
    if (pre_compute_descendants)
        extractDescendants();
    if (pre_compute_ancestors)
        extractAncestors(word_coverage_threshold, true, true);
}

WordNetOntology::WordNetOntology(string voc_file,
                                 string synset_file,
                                 string ontology_file,
                                 bool pre_compute_ancestors,
                                 bool pre_compute_descendants,
                                 int word_coverage_threshold)
{
    init();
    load(voc_file, synset_file, ontology_file);
    if (pre_compute_descendants)
        extractDescendants();
    if (pre_compute_ancestors)
        extractAncestors(word_coverage_threshold, true, true);
}

WordNetOntology::WordNetOntology(string voc_file,
                                 string synset_file,
                                 string ontology_file,
                                 string sense_key_file,
                                 bool pre_compute_ancestors,
                                 bool pre_compute_descendants,
                                 int word_coverage_threshold)
{
    init();
    load(voc_file, synset_file, ontology_file, sense_key_file);
    if (pre_compute_descendants)
        extractDescendants();
    if (pre_compute_ancestors)
        extractAncestors(word_coverage_threshold, true, true);
}
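
// A minimal usage sketch (the vocabulary file name below is hypothetical):
//
//     WordNetOntology onto("myvoc.txt",            // one word per line
//                          true,                   // differentiate unknown words
//                          true,                   // pre-compute ancestors
//                          true,                   // pre-compute descendants
//                          ALL_WN_TYPE,            // extract all POS types
//                          WORD_COVERAGE_THRESHOLD);
//     onto.printStats();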

void WordNetOntology::init(bool the_differentiate_unknown_words)
{
    if (wninit() != 0) {
    }

    noun_count = 0;
    verb_count = 0;
    adj_count = 0;
    adv_count = 0;

    synset_index = EOS_SS_ID + 1;
    word_index = 0;
    unknown_sense_index = 0;

    noun_sense_count = 0;
    verb_sense_count = 0;
    adj_sense_count = 0;
    adv_sense_count = 0;

    in_wn_word_count = 0;
    out_of_wn_word_count = 0;

    are_ancestors_extracted = false;
    are_descendants_extracted = false;
    are_predominent_pos_extracted = false;
    are_word_high_level_senses_extracted = false;
    are_word_sense_unique_ids_computed = false;

    n_word_high_level_senses = 0;

    differentiate_unknown_words = the_differentiate_unknown_words;
}
void WordNetOntology::createBaseSynsets()
{
    Node* root_node = new Node(ROOT_SS_ID);
    root_node->syns.push_back("ROOT");
    root_node->types.insert(UNDEFINED_TYPE);
    root_node->gloss = "(root concept)";
    root_node->hereiam = ROOT_OFFSET;
    synsets[ROOT_SS_ID] = root_node;

    Node* unk_node = new Node(SUPER_UNKNOWN_SS_ID);
    unk_node->syns.push_back("SUPER_UNKNOWN");
    unk_node->types.insert(UNDEFINED_TYPE);
    unk_node->gloss = "(super-unknown concept)";
    unk_node->hereiam = SUPER_UNKNOWN_OFFSET;
    synsets[SUPER_UNKNOWN_SS_ID] = unk_node;

    unk_node->parents.insert(ROOT_SS_ID);
    root_node->children.insert(SUPER_UNKNOWN_SS_ID);

    Node* oov_node = new Node(OOV_SS_ID);
    oov_node->syns.push_back("OOV");
    oov_node->types.insert(UNDEFINED_TYPE);
    oov_node->gloss = "(out-of-vocabulary)";
    oov_node->hereiam = OOV_OFFSET;
    synsets[OOV_SS_ID] = oov_node;

    oov_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(OOV_SS_ID);

    Node* proper_node = new Node(PROPER_NOUN_SS_ID);
    proper_node->syns.push_back("PROPER NOUN");
    proper_node->types.insert(UNDEFINED_TYPE);
    proper_node->gloss = "(proper noun)";
    proper_node->hereiam = PROPER_NOUN_OFFSET;
    synsets[PROPER_NOUN_SS_ID] = proper_node;

    Node* num_node = new Node(NUMERIC_SS_ID);
    num_node->syns.push_back("NUMERIC");
    num_node->types.insert(UNDEFINED_TYPE);
    num_node->gloss = "(numeric)";
    num_node->hereiam = NUMERIC_OFFSET;
    synsets[NUMERIC_SS_ID] = num_node;

    Node* punct_node = new Node(PUNCTUATION_SS_ID);
    punct_node->syns.push_back("PUNCTUATION");
    punct_node->types.insert(UNDEFINED_TYPE);
    punct_node->gloss = "(punctuation)";
    punct_node->hereiam = PUNCTUATION_OFFSET;
    synsets[PUNCTUATION_SS_ID] = punct_node;

    Node* stop_node = new Node(STOP_SS_ID);
    stop_node->syns.push_back("STOP");
    stop_node->types.insert(UNDEFINED_TYPE);
    stop_node->gloss = "(stop)";
    stop_node->hereiam = STOP_OFFSET;
    synsets[STOP_SS_ID] = stop_node;

    Node* bos_node = new Node(BOS_SS_ID);
    bos_node->syns.push_back("BOS");
    bos_node->types.insert(UNDEFINED_TYPE);
    bos_node->gloss = "(BOS)";
    bos_node->hereiam = BOS_OFFSET;
    synsets[BOS_SS_ID] = bos_node;

    Node* eos_node = new Node(EOS_SS_ID);
    eos_node->syns.push_back("EOS");
    eos_node->types.insert(UNDEFINED_TYPE);
    eos_node->gloss = "(EOS)";
    eos_node->hereiam = EOS_OFFSET;
    synsets[EOS_SS_ID] = eos_node;

    proper_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(PROPER_NOUN_SS_ID);
    num_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(NUMERIC_SS_ID);
    punct_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(PUNCTUATION_SS_ID);
    stop_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(STOP_SS_ID);
    bos_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(BOS_SS_ID);
    eos_node->parents.insert(SUPER_UNKNOWN_SS_ID);
    unk_node->children.insert(EOS_SS_ID);

    Node* noun_node = new Node(NOUN_SS_ID);
    noun_node->syns.push_back("NOUN");
    noun_node->types.insert(UNDEFINED_TYPE);
    noun_node->gloss = "(noun concept)";
    noun_node->hereiam = NOUN_OFFSET;
    synsets[NOUN_SS_ID] = noun_node;

    Node* verb_node = new Node(VERB_SS_ID);
    verb_node->syns.push_back("VERB");
    verb_node->types.insert(UNDEFINED_TYPE);
    verb_node->gloss = "(verb concept)";
    verb_node->hereiam = VERB_OFFSET;
    synsets[VERB_SS_ID] = verb_node;

    Node* adj_node = new Node(ADJ_SS_ID);
    adj_node->syns.push_back("ADJECTIVE");
    adj_node->types.insert(UNDEFINED_TYPE);
    adj_node->gloss = "(adjective concept)";
    adj_node->hereiam = ADJ_OFFSET;
    synsets[ADJ_SS_ID] = adj_node;

    Node* adv_node = new Node(ADV_SS_ID);
    adv_node->syns.push_back("ADVERB");
    adv_node->types.insert(UNDEFINED_TYPE);
    adv_node->gloss = "(adverb concept)";
    adv_node->hereiam = ADV_OFFSET;
    synsets[ADV_SS_ID] = adv_node;

    noun_node->parents.insert(ROOT_SS_ID);
    root_node->children.insert(NOUN_SS_ID);
    verb_node->parents.insert(ROOT_SS_ID);
    root_node->children.insert(VERB_SS_ID);
    adj_node->parents.insert(ROOT_SS_ID);
    root_node->children.insert(ADJ_SS_ID);
    adv_node->parents.insert(ROOT_SS_ID);
    root_node->children.insert(ADV_SS_ID);
}
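
// Reads the vocabulary file (one word per line; lines starting with "##"
// are comments) and extracts the WordNet senses of every word.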
void WordNetOntology::extract(string voc_file, int wn_pos_type)
{
    int n_lines = ShellProgressBar::getAsciiFileLineCount(voc_file);
    ShellProgressBar progress(0, n_lines - 1, "extracting ontology", 50);
    progress.draw();
    ifstream input_if(voc_file.c_str());
    string word;
    while (!input_if.eof())
    {
        getline(input_if, word, '\n');
        if (word == "") continue;
        if (word[0] == '#' && word[1] == '#') continue;
        extractWord(word, wn_pos_type, true, true, false);
        progress.update(word_index);
    }
    input_if.close();
    progress.done();
    finalize();
}

bool WordNetOntology::isInWordNet(string word, bool trim_word, bool stem_word, bool remove_undescores)
{
    if (trim_word)
        word = trimWord(word);

    if (remove_undescores)
        word = underscore_to_space(word);

    if (word == NULL_TAG)
    {
        return false;
    }
    else
    {
        bool found_noun = hasSenseInWordNet(word, NOUN_TYPE);
        bool found_verb = hasSenseInWordNet(word, VERB_TYPE);
        bool found_adj = hasSenseInWordNet(word, ADJ_TYPE);
        bool found_adv = hasSenseInWordNet(word, ADV_TYPE);
        bool found_stemmed_noun = false;
        bool found_stemmed_verb = false;
        bool found_stemmed_adj = false;
        bool found_stemmed_adv = false;

        if (stem_word)
        {
            string stemmed_word = stemWord(word, NOUN);
            if (stemmed_word != word)
                found_stemmed_noun = hasSenseInWordNet(stemmed_word, NOUN_TYPE);
            stemmed_word = stemWord(word, VERB);
            if (stemmed_word != word)
                found_stemmed_verb = hasSenseInWordNet(stemmed_word, VERB_TYPE);
            stemmed_word = stemWord(word, ADJ);
            if (stemmed_word != word)
                found_stemmed_adj = hasSenseInWordNet(stemmed_word, ADJ_TYPE);
            stemmed_word = stemWord(word, ADV);
            if (stemmed_word != word)
                found_stemmed_adv = hasSenseInWordNet(stemmed_word, ADV_TYPE);
        }

        return (found_noun || found_verb || found_adj || found_adv ||
                found_stemmed_noun || found_stemmed_verb || found_stemmed_adj || found_stemmed_adv);
    }
}

bool WordNetOntology::hasSenseInWordNet(string word, int wn_pos_type)
{
    char* cword = cstr(word);
    SynsetPtr ssp = NULL;

    switch (wn_pos_type)
    {
    case NOUN_TYPE:
        ssp = findtheinfo_ds(cword, NOUN, -HYPERPTR, ALLSENSES);
        break;
    case VERB_TYPE:
        ssp = findtheinfo_ds(cword, VERB, -HYPERPTR, ALLSENSES);
        break;
    case ADJ_TYPE:
        ssp = findtheinfo_ds(cword, ADJ, -HYPERPTR, ALLSENSES);
        break;
    case ADV_TYPE:
        ssp = findtheinfo_ds(cword, ADV, -HYPERPTR, ALLSENSES);
        break;
    }

    bool ssp_is_null = (ssp == NULL);

    delete[] cword; // cstr() allocates with new[]
    if (ssp != NULL)
        free_syns(ssp);

    return !ssp_is_null;
}
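
// Registers one vocabulary word: catches special tags (<oov>, numeric,
// punctuation, ...), then extracts its WordNet senses for the requested
// part-of-speech types, optionally retrying on the stemmed form.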
void WordNetOntology::extractWord(string original_word, int wn_pos_type,
                                  bool trim_word, bool stem_word, bool remove_undescores)
{
    bool found_noun = false;
    bool found_verb = false;
    bool found_adj = false;
    bool found_adv = false;
    bool found_stemmed_noun = false;
    bool found_stemmed_verb = false;
    bool found_stemmed_adj = false;
    bool found_stemmed_adv = false;
    bool found = false;
    string processed_word = original_word;
    string stemmed_word;

    words[word_index] = original_word;
    words_id[original_word] = word_index;

    if (!catchSpecialTags(original_word))
    {
        if (trim_word)
            processed_word = trimWord(original_word);

        if (remove_undescores)
            processed_word = underscore_to_space(processed_word);

        if (processed_word == NULL_TAG)
        {
            out_of_wn_word_count++;
            processUnknownWord(word_index);
            word_is_in_wn[word_index] = false;
        }
        else
        {
            if (wn_pos_type == NOUN_TYPE || wn_pos_type == ALL_WN_TYPE)
                found_noun = extractSenses(original_word, processed_word, NOUN_TYPE);
            if (wn_pos_type == VERB_TYPE || wn_pos_type == ALL_WN_TYPE)
                found_verb = extractSenses(original_word, processed_word, VERB_TYPE);
            if (wn_pos_type == ADJ_TYPE || wn_pos_type == ALL_WN_TYPE)
                found_adj = extractSenses(original_word, processed_word, ADJ_TYPE);
            if (wn_pos_type == ADV_TYPE || wn_pos_type == ALL_WN_TYPE)
                found_adv = extractSenses(original_word, processed_word, ADV_TYPE);

            if (stem_word)
            {
                if (wn_pos_type == NOUN_TYPE || wn_pos_type == ALL_WN_TYPE)
                {
                    stemmed_word = stemWord(processed_word, NOUN);
                    if (stemmed_word != processed_word)
                        found_stemmed_noun = extractSenses(original_word, stemmed_word, NOUN_TYPE);
                }
                if (wn_pos_type == VERB_TYPE || wn_pos_type == ALL_WN_TYPE)
                {
                    stemmed_word = stemWord(processed_word, VERB);
                    if (stemmed_word != processed_word)
                        found_stemmed_verb = extractSenses(original_word, stemmed_word, VERB_TYPE);
                }
                if (wn_pos_type == ADJ_TYPE || wn_pos_type == ALL_WN_TYPE)
                {
                    stemmed_word = stemWord(processed_word, ADJ);
                    if (stemmed_word != processed_word)
                        found_stemmed_adj = extractSenses(original_word, stemmed_word, ADJ_TYPE);
                }
                if (wn_pos_type == ADV_TYPE || wn_pos_type == ALL_WN_TYPE)
                {
                    stemmed_word = stemWord(processed_word, ADV);
                    if (stemmed_word != processed_word)
                        found_stemmed_adv = extractSenses(original_word, stemmed_word, ADV_TYPE);
                }
            }

            found = (found_noun || found_verb || found_adj || found_adv ||
                     found_stemmed_noun || found_stemmed_verb || found_stemmed_adj || found_stemmed_adv);
            if (found)
            {
                in_wn_word_count++;
                word_is_in_wn[word_index] = true;
            }
            else
            {
                out_of_wn_word_count++;
                processUnknownWord(word_index);
                word_is_in_wn[word_index] = false;
            }
        }
    }
    else
    {
        out_of_wn_word_count++;
        word_is_in_wn[word_index] = false;
    }
    if (word_to_senses[word_index].isEmpty())
        PLWARNING("word %d (%s) was not processed correctly (found = %d)",
                  word_index, words[word_index].c_str(), found);
    word_index++;
}
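
// Returns the already-extracted synset node matching the given synonym
// list, gloss, database offset and file number, or NULL if none exists.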
Node *
WordNetOntology::findSynsetFromSynsAndGloss(const vector<string> &syns,
                                            const string &gloss,
                                            const long offset,
                                            const int fnum)
{
    for (map<int, Node *>::iterator it = synsets.begin(); it != synsets.end(); ++it) {
        Node *node = it->second;
        if ((node->gloss == gloss) && (node->syns == syns) &&
            (node->hereiam == offset) && (node->fnum == fnum))
            return node;
    }
    return NULL;
}
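
// Walks every word through the four WordNet databases and records, for
// senses carrying semantic-concordance tag counts, an entry in
// word_senses_to_tagged_frequencies (see the note on freq below).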
void
WordNetOntology::extractTaggedWordFrequencies(map<int, map<int, int> > &word_senses_to_tagged_frequencies)
{
    cout << "in WordNetOntology::extractTaggedWordFrequencies()" << endl;
    vector<int> dbases;
    dbases.reserve(4);
    dbases.push_back(NOUN);
    dbases.push_back(VERB);
    dbases.push_back(ADJ);
    dbases.push_back(ADV);
    int dbases_size = dbases.size();

    word_senses_to_tagged_frequencies.clear();
    vector<string> syns;
    string gloss;
    long offset;
    int fnum;

    int total_senses_found = 0;
    ShellProgressBar progress(0, words.size() * dbases_size,
                              "[Extracting word-sense tagged frequencies]", 50);
    progress.draw();
    int ws2tf_i = 0;

    for (int i = 0; i < dbases_size; ++i) {
        for (map<int, string>::iterator w_it = words.begin(); w_it != words.end(); ++w_it) {
            progress.update(++ws2tf_i);
            char *cword = cstr(w_it->second);
            wnresults.numforms = wnresults.printcnt = 0;
            SynsetPtr ssp = findtheinfo_ds(cword, dbases[i], -HYPERPTR, ALLSENSES);
            if (ssp != NULL) {
                IndexPtr idx;
                SynsetPtr cursyn;
                while ((idx = getindex(cword, dbases[i])) != NULL) {
                    cword = NULL;
                    if (idx->tagged_cnt) {
                        map<int, map<int, int> >::iterator ws2tf_it =
                            word_senses_to_tagged_frequencies.find(w_it->first);
                        if (ws2tf_it == word_senses_to_tagged_frequencies.end()) {
                            word_senses_to_tagged_frequencies[w_it->first] = map<int, int>();
                            ws2tf_it = word_senses_to_tagged_frequencies.find(w_it->first);
                        }

                        for (int l = 0; l < idx->sense_cnt; ++l) {
                            if ((cursyn = read_synset(dbases[i], idx->offset[l], idx->wd)) != NULL) {
                                // NOTE: freq is never updated below, so every
                                // matched sense is stored with frequency -1.
                                int freq = -1;
                                wnresults.OutSenseCount[wnresults.numforms]++;

                                syns = getSynsetWords(cursyn);
                                gloss = string(cursyn->defn);
                                offset = cursyn->hereiam;
                                fnum = cursyn->fnum;

                                Node *node = findSynsetFromSynsAndGloss(syns, gloss, offset, fnum);
                                if (node != NULL) {
                                    (ws2tf_it->second)[node->ss_id] = freq;
                                    ++total_senses_found;
                                }

                                free_synset(cursyn);
                            }
                        }
                    }
                    wnresults.numforms++;
                    free_index(idx);
                }
                free_syns(ssp);
            }
        }
    }
    progress.done();
    cout << "FOUND A GRAND TOTAL OF " << total_senses_found << " senses" << endl;
}
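
// Extracts all WordNet senses of processed_word for one part-of-speech
// type, creating synset nodes as needed, and records the word-to-sense,
// wnsn and sense-key mappings under original_word's id.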
bool WordNetOntology::extractSenses(string original_word, string processed_word, int wn_pos_type)
{
    char* cword = cstr(processed_word);
    SynsetPtr ssp = NULL;
    IndexPtr idx = getindex(cword, wn_pos_type);

    switch (wn_pos_type)
    {
    case NOUN_TYPE:
        ssp = findtheinfo_ds(cword, NOUN, -HYPERPTR, ALLSENSES);
        break;
    case VERB_TYPE:
        ssp = findtheinfo_ds(cword, VERB, -HYPERPTR, ALLSENSES);
        break;
    case ADJ_TYPE:
        ssp = findtheinfo_ds(cword, ADJ, -HYPERPTR, ALLSENSES);
        break;
    case ADV_TYPE:
        ssp = findtheinfo_ds(cword, ADV, -HYPERPTR, ALLSENSES);
        break;
    }

    if (ssp == NULL)
    {
        if (idx != NULL)
            free_index(idx);
        delete[] cword;
        return false;
    }
    else
    {
        switch (wn_pos_type)
        {
        case NOUN_TYPE:
            noun_count++;
            break;
        case VERB_TYPE:
            verb_count++;
            break;
        case ADJ_TYPE:
            adj_count++;
            break;
        case ADV_TYPE:
            adv_count++;
            break;
        }

        int wnsn = 0;
        SynsetPtr head_ssp = ssp; // keep the list head so the whole chain can be freed

        while (ssp != NULL)
        {
            wnsn++;
            Node* node = checkForAlreadyExtractedSynset(ssp);
            if (node == NULL)
            {
                switch (wn_pos_type)
                {
                case NOUN_TYPE:
                    noun_sense_count++;
                    break;
                case VERB_TYPE:
                    verb_sense_count++;
                    break;
                case ADJ_TYPE:
                    adj_sense_count++;
                    break;
                case ADV_TYPE:
                    adv_sense_count++;
                    break;
                }

                node = extractOntology(ssp);
            }

            int word_id = words_id[original_word];
            node->types.insert(wn_pos_type);
            word_to_senses[word_id].insert(node->ss_id);
            sense_to_words[node->ss_id].insert(word_id);

            char *charsk = WNSnsToStr(idx, wnsn);
            string sense_key(charsk);
            free(charsk); // WNSnsToStr() returns a malloc'ed string

            pair<int, string> ss(word_id, sense_key);
            if (sense_key_to_ss_id.find(ss) == sense_key_to_ss_id.end())
                sense_key_to_ss_id[ss] = node->ss_id;
            pair<int, int> ws(word_id, node->ss_id);
            if (ws_id_to_sense_key.find(ws) == ws_id_to_sense_key.end())
                ws_id_to_sense_key[ws] = sense_key;

            switch (wn_pos_type)
            {
            case NOUN_TYPE:
                word_to_noun_wnsn[word_id].push_back(node->ss_id);
                word_to_noun_senses[word_id].insert(node->ss_id);
                break;
            case VERB_TYPE:
                word_to_verb_wnsn[word_id].push_back(node->ss_id);
                word_to_verb_senses[word_id].insert(node->ss_id);
                break;
            case ADJ_TYPE:
                word_to_adj_wnsn[word_id].push_back(node->ss_id);
                word_to_adj_senses[word_id].insert(node->ss_id);
                break;
            case ADV_TYPE:
                word_to_adv_wnsn[word_id].push_back(node->ss_id);
                word_to_adv_senses[word_id].insert(node->ss_id);
                break;
            }

            ssp = ssp->nextss;
        }
        free_syns(head_ssp);
        if (idx != NULL)
            free_index(idx);
        delete[] cword;
        return true;
    }
}
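
// Creates a node for the given synset and recursively extracts its
// hypernym chain, linking parents and children along the way.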
Node* WordNetOntology::extractOntology(SynsetPtr ssp)
{
    Node* node = new Node(synset_index++);
    node->syns = getSynsetWords(ssp);
    string defn = ssp->defn;
    removeDelimiters(defn, "*", "%");
    removeDelimiters(defn, "|", "/");
    node->gloss = defn;
    node->hereiam = ssp->hereiam;
    node->fnum = ssp->fnum;
    node->is_unknown = false;
    synsets[node->ss_id] = node;

    ssp = ssp->ptrlist;

    while (ssp != NULL)
    {
        Node* parent_node = checkForAlreadyExtractedSynset(ssp);
        if (parent_node == NULL)
        {
            parent_node = extractOntology(ssp);
        }

        if (parent_node->ss_id != node->ss_id && !(node->children.contains(parent_node->ss_id)))
        {
            node->parents.insert(parent_node->ss_id);
            parent_node->children.insert(node->ss_id);
        }

        ssp = ssp->nextss;
    }
    return node;
}

bool WordNetOntology::catchSpecialTags(string word)
{
    int word_id = words_id[word];
    if (word == OOV_TAG)
    {
        word_to_senses[word_id].insert(OOV_SS_ID);
        sense_to_words[OOV_SS_ID].insert(word_id);
        return true;
    }
    else if (word == PROPER_NOUN_TAG)
    {
        word_to_senses[word_id].insert(PROPER_NOUN_SS_ID);
        sense_to_words[PROPER_NOUN_SS_ID].insert(word_id);
        return true;
    }
    else if (word == NUMERIC_TAG)
    {
        word_to_senses[word_id].insert(NUMERIC_SS_ID);
        sense_to_words[NUMERIC_SS_ID].insert(word_id);
        return true;
    }
    else if (word == PUNCTUATION_TAG)
    {
        word_to_senses[word_id].insert(PUNCTUATION_SS_ID);
        sense_to_words[PUNCTUATION_SS_ID].insert(word_id);
        return true;
    }
    else if (word == STOP_TAG)
    {
        word_to_senses[word_id].insert(STOP_SS_ID);
        sense_to_words[STOP_SS_ID].insert(word_id);
        return true;
    }
    else if (word == BOS_TAG)
    {
        word_to_senses[word_id].insert(BOS_SS_ID);
        sense_to_words[BOS_SS_ID].insert(word_id);
        return true;
    }
    else if (word == EOS_TAG)
    {
        word_to_senses[word_id].insert(EOS_SS_ID);
        sense_to_words[EOS_SS_ID].insert(word_id);
        return true;
    }
    return false;
}

void WordNetOntology::lookForSpecialTags()
{
    if (!isSense(OOV_SS_ID))
        PLWARNING("no <oov> tag found");
    if (!isSense(PROPER_NOUN_SS_ID))
        PLWARNING("no <proper_noun> tag found");
    if (!isSense(NUMERIC_SS_ID))
        PLWARNING("no <numeric> tag found");
    if (!isSense(PUNCTUATION_SS_ID))
        PLWARNING("no <punctuation> tag found");
    if (!isSense(STOP_SS_ID))
        PLWARNING("no <stop> tag found");
}

void WordNetOntology::finalize()
{
    propagatePOSTypes();
    linkUpperCategories();
    removeNonReachableSynsets();
}
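
// Maps a (word, POS, wnsn) triple to the internal synset id; wnsn is the
// 1-based WordNet sense number within that part of speech.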
int WordNetOntology::getWordSenseIdForWnsn(string word, int wn_pos_type, int wnsn)
{
    if (!isWord(word))
    {
#ifndef NOWARNING
        PLWARNING("asking for a non-word (%s)", word.c_str());
#endif
        return WNO_ERROR;
    }

    int word_id = words_id[word];
    switch (wn_pos_type)
    {
    case NOUN_TYPE:
        if (wnsn > (int)word_to_noun_wnsn[word_id].size())
        {
#ifndef NOWARNING
            PLWARNING("invalid noun wnsn (%d)", wnsn);
#endif
            return WNO_ERROR;
        }
        else
            return word_to_noun_wnsn[word_id][wnsn - 1];
    case VERB_TYPE:
        if (wnsn > (int)word_to_verb_wnsn[word_id].size())
        {
#ifndef NOWARNING
            PLWARNING("invalid verb wnsn (%d)", wnsn);
#endif
            return WNO_ERROR;
        }
        else
            return word_to_verb_wnsn[word_id][wnsn - 1];
    case ADJ_TYPE:
        if (wnsn > (int)word_to_adj_wnsn[word_id].size())
        {
#ifndef NOWARNING
            PLWARNING("invalid adj wnsn (%d)", wnsn);
#endif
            return WNO_ERROR;
        }
        else
            return word_to_adj_wnsn[word_id][wnsn - 1];
    case ADV_TYPE:
        if (wnsn > (int)word_to_adv_wnsn[word_id].size())
        {
#ifndef NOWARNING
            PLWARNING("invalid adv wnsn (%d)", wnsn);
#endif
            return WNO_ERROR;
        }
        else
            return word_to_adv_wnsn[word_id][wnsn - 1];
    default:
#ifndef NOWARNING
        PLWARNING("undefined type");
#endif
        return WNO_ERROR;
    }
}

int WordNetOntology::getWordSenseIdForSenseKey(string lemma, string lexsn, string word)
{
    string sense_key = lemma + "%" + lexsn;
    char* csense_key = cstr(sense_key);
    SynsetPtr ssp = GetSynsetForSense(csense_key);
    if (ssp != NULL)
    {
        vector<string> synset_words = getSynsetWords(ssp);
        string gloss = ssp->defn;
        int word_id = words_id[word];
        long offset = ssp->hereiam;
        int fnum = ssp->fnum;
        for (SetIterator it = word_to_senses[word_id].begin(); it != word_to_senses[word_id].end(); ++it)
        {
            Node* node = synsets[*it];
            if (node->syns == synset_words && node->gloss == gloss &&
                node->hereiam == offset && node->fnum == fnum)
                return node->ss_id;
        }
    }
    return WNO_ERROR;
}
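
// An out-of-WordNet word either gets its own UNKNOWN_SENSE_<n> synset under
// SUPER_UNKNOWN (when differentiate_unknown_words is set), or shares the
// SUPER_UNKNOWN synset with all other unknown words.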
void WordNetOntology::processUnknownWord(int word_id)
{
    if (differentiate_unknown_words)
    {
        Node* unk_node = new Node(synset_index++);
        int unknown_sense_id = unknown_sense_index++;
        unk_node->syns.push_back("UNKNOWN_SENSE_" + tostring(unknown_sense_id));
        unk_node->gloss = "(unknown sense " + tostring(unknown_sense_id) + ")";
        unk_node->types.insert(UNDEFINED_TYPE);
        unk_node->hereiam = EOS_OFFSET - unknown_sense_id - 1;
        synsets[unk_node->ss_id] = unk_node;

        unk_node->parents.insert(SUPER_UNKNOWN_SS_ID);
        synsets[SUPER_UNKNOWN_SS_ID]->children.insert(unk_node->ss_id);

        word_to_senses[word_id].insert(unk_node->ss_id);
        sense_to_words[unk_node->ss_id].insert(word_id);
    }
    else
    {
        word_to_senses[word_id].insert(SUPER_UNKNOWN_SS_ID);
        sense_to_words[SUPER_UNKNOWN_SS_ID].insert(word_id);
    }
}
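
// Pushes each sense's part-of-speech type(s) up through its ancestors,
// using the visited flag to avoid re-walking shared parents.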
void WordNetOntology::propagatePOSTypes()
{
    for (map<int, Set>::iterator it = sense_to_words.begin(); it != sense_to_words.end(); ++it)
    {
        Node* node = synsets[it->first];
        propagatePOSTypes(node);
    }
    unvisitAll();
}

void WordNetOntology::propagatePOSTypes(Node* node)
{
    node->visited = true;
    for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it)
    {
        Node* parent_node = synsets[*it];
        for (SetIterator iit = node->types.begin(); iit != node->types.end(); ++iit)
        {
            parent_node->types.insert(*iit);
        }
        if (parent_node->types.size() > 1)
        {
#ifndef NOWARNING
            PLWARNING("a synset has more than 1 type");
#endif
        }
        if (!parent_node->visited)
            propagatePOSTypes(parent_node);
    }
}

void WordNetOntology::unvisitAll()
{
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
        it->second->visited = false;
}
void WordNetOntology::linkUpperCategories()
{
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        int ss_id = it->first;
        Node* node = it->second;
        if (node->parents.size() == 0 && ss_id != ROOT_SS_ID)
        {
            bool link_directly_to_root = true;
            if (node->types.contains(NOUN_TYPE))
            {
                node->parents.insert(NOUN_SS_ID);
                synsets[NOUN_SS_ID]->children.insert(ss_id);
                link_directly_to_root = false;
            }
            if (node->types.contains(VERB_TYPE))
            {
                node->parents.insert(VERB_SS_ID);
                synsets[VERB_SS_ID]->children.insert(ss_id);
                link_directly_to_root = false;
            }
            if (node->types.contains(ADJ_TYPE))
            {
                node->parents.insert(ADJ_SS_ID);
                synsets[ADJ_SS_ID]->children.insert(ss_id);
                link_directly_to_root = false;
            }
            if (node->types.contains(ADV_TYPE))
            {
                node->parents.insert(ADV_SS_ID);
                synsets[ADV_SS_ID]->children.insert(ss_id);
                link_directly_to_root = false;
            }
            if (link_directly_to_root)
            {
                node->parents.insert(ROOT_SS_ID);
                synsets[ROOT_SS_ID]->children.insert(ss_id);
            }
        }
    }
}

Node* WordNetOntology::checkForAlreadyExtractedSynset(SynsetPtr ssp)
{
    vector<string> syns = getSynsetWords(ssp);
    string gloss = ssp->defn;
    long offset = ssp->hereiam;
    int fnum = ssp->fnum;
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        Node* node = it->second;
        if (node->syns == syns && node->gloss == gloss &&
            node->hereiam == offset && node->fnum == fnum)
        {
            return node;
        }
    }
    return NULL;
}

vector<string> WordNetOntology::getSynsetWords(SynsetPtr ssp)
{
    vector<string> syns;
    for (int i = 0; i < ssp->wcount; i++)
    {
        strsubst(ssp->words[i], '_', ' ');
        string word_i = ssp->words[i];
        removeDelimiters(word_i, "*", "%");
        removeDelimiters(word_i, "|", "/");
        syns.push_back(word_i);
    }
    return syns;
}

void WordNetOntology::print(bool print_ontology)
{
    for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it)
    {
        cout << words[it->first] << endl;
        for (SetIterator iit = it->second.begin(); iit != it->second.end(); ++iit)
        {
            printSynset(*iit, 1);
            if (print_ontology)
            {
                printOntology(synsets[*iit], 2);
            }
        }
    }
}

void WordNetOntology::printOntology(Node* node, int level)
{
    for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it)
    {
        printSynset(*it, level);
        printOntology(synsets[*it], level + 1);
    }
}

void WordNetOntology::printSynset(int ss_id, int indent_level)
{
    for (int i = 0; i < indent_level; i++) cout << " ";
    cout << "=> ";
    for (vector<string>::iterator it = synsets[ss_id]->syns.begin(); it != synsets[ss_id]->syns.end(); ++it)
    {
        cout << *it << ", ";
    }
    cout << " (" << ss_id << ")" << endl;
    for (int i = 0; i < indent_level; i++) cout << " ";
    cout << "fnum: " << synsets[ss_id]->fnum
         << " synset offset: " << synsets[ss_id]->hereiam
         << " gloss = " << synsets[ss_id]->gloss << endl;
}

void WordNetOntology::printSynset(int ss_id, ostream& sout, int indent_level)
{
    for (int i = 0; i < indent_level; i++) sout << " ";
    sout << "=> ";
    for (vector<string>::iterator it = synsets[ss_id]->syns.begin(); it != synsets[ss_id]->syns.end(); ++it)
    {
        sout << *it << ", ";
    }
    sout << " (" << ss_id << ")" << endl;
    for (int i = 0; i < indent_level; i++) sout << " ";
    sout << "gloss = " << synsets[ss_id]->gloss << endl;
}

void WordNetOntology::printStats()
{
    cout << getVocSize() << " words in vocabulary" << endl;
    cout << in_wn_word_count << " in WN words" << endl;
    cout << out_of_wn_word_count << " out of WN words" << endl;
    cout << getSenseSize() << " senses ("
         << (real)getSenseSize() / (real)getVocSize()
         << " senses per word on average)" << endl;
    cout << getSynsetSize() << " categories (ontology : sense + category, possible overlap)" << endl;
    if (are_word_high_level_senses_extracted)
    {
        cout << n_word_high_level_senses << " high-level senses ("
             << (real)n_word_high_level_senses / (real)getVocSize()
             << " high-level senses per word on average)" << endl;
    }
}
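
// Synset file format (one synset per line):
//   <ss_id>*|<type>|...*|<gloss>|<syn>|...*|<fnum>|<offset>|
// Ontology file format (one record per line):
//   "w <word_id> <is_in_wn>", "c <parent_ss_id> <child_ss_id>",
//   "s <ss_id> <word_id>".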
void WordNetOntology::save(string synset_file, string ontology_file)
{
    ofstream of_synsets(synset_file.c_str());
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        int ss_id = it->first;
        Node* node = it->second;
        of_synsets << ss_id << "*|";
        for (SetIterator tit = node->types.begin(); tit != node->types.end(); ++tit)
        {
            of_synsets << *tit << "|";
        }
        of_synsets << "*|";
        of_synsets << node->gloss << "|";
        for (vector<string>::iterator iit = node->syns.begin(); iit != node->syns.end(); ++iit)
        {
            of_synsets << *iit << "|";
        }
        of_synsets << "*|";
        of_synsets << node->fnum << "|";
        of_synsets << node->hereiam << "|";
        of_synsets << endl;
    }
    of_synsets.close();

    ofstream of_ontology(ontology_file.c_str());
    for (map<int, Set>::iterator wit = word_to_senses.begin(); wit != word_to_senses.end(); ++wit)
    {
        int word_id = wit->first;
        of_ontology << "w " << word_id << " " << word_is_in_wn[word_id] << endl;
    }
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        int id = it->first;
        Node* node = it->second;
        for (SetIterator iit = node->children.begin(); iit != node->children.end(); ++iit)
        {
            int child_id = *iit;
            of_ontology << "c " << id << " " << child_id << endl;
        }
        if (sense_to_words.find(id) != sense_to_words.end())
        {
            for (SetIterator iit = sense_to_words[id].begin(); iit != sense_to_words[id].end(); ++iit)
                of_ontology << "s " << id << " " << (*iit) << endl;
        }
    }
    of_ontology.close();
}

void WordNetOntology::save(string voc_file)
{
    ofstream of_voc(voc_file.c_str());
    for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it)
    {
        of_voc << it->second << endl;
    }
    of_voc.close();
}

void WordNetOntology::saveVocInWordnet(string voc_file)
{
    ofstream of_voc(voc_file.c_str());
    for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it)
    {
        if (word_is_in_wn[it->first] == false) continue;
        of_voc << it->second << endl;
    }
    of_voc.close();
}

void WordNetOntology::save(string synset_file, string ontology_file, string sense_key_file)
{
    save(synset_file, ontology_file);

    ofstream of_voc(sense_key_file.c_str());
    for (map<pair<int, int>, string>::iterator it = ws_id_to_sense_key.begin(); it != ws_id_to_sense_key.end(); ++it)
    {
        of_voc << it->second << " " << (it->first).first << " " << (it->first).second << endl;
    }
    of_voc.close();
}
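
// Loads a previously saved ontology; see the format notes above save().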
void WordNetOntology::load(string voc_file, string synset_file, string ontology_file)
{
    ifstream if_voc(voc_file.c_str());
    if (!if_voc) PLERROR("can't open %s", voc_file.c_str());
    ifstream if_synsets(synset_file.c_str());
    if (!if_synsets) PLERROR("can't open %s", synset_file.c_str());
    ifstream if_ontology(ontology_file.c_str());
    if (!if_ontology) PLERROR("can't open %s", ontology_file.c_str());

    string line;
    int word_count = 0;
    while (!if_voc.eof())
    {
        getline(if_voc, line, '\n');
        if (line == "") continue;
        if (line[0] == '#' && line[1] == '#') continue;
        words_id[line] = word_count;
        word_to_senses[word_count] = Set();
        words[word_count++] = line;
    }
    if_voc.close();
    word_index = word_count;
    int line_no = 0;
    int ss_id = -1;
    while (!if_synsets.eof())
    {
        ++line_no;
        getline(if_synsets, line, '\n');
        if (line == "") continue;
        if (line[0] == '#') continue;
        vector<string> tokens = split(line, "*");
        if (tokens.size() != 3 && tokens.size() != 4)
            PLERROR("the synset file has not the expected format, line %d = '%s'", line_no, line.c_str());
        if (tokens.size() == 3 && line_no == 1)
            PLWARNING("The synset file doesn't contain enough information for correct representation of the synsets!");
        ss_id = toint(tokens[0]);
        vector<string> type_tokens = split(tokens[1], "|");
        vector<string> ss_tokens = split(tokens[2], "|");
        vector<string> offset_tokens;
        if (tokens.size() == 4) offset_tokens = split(tokens[3], "|");
        Node* node = new Node(ss_id);
        for (unsigned int i = 0; i < type_tokens.size(); i++)
            node->types.insert(toint(type_tokens[i]));
        node->gloss = ss_tokens[0];

        for (unsigned int i = 1; i < ss_tokens.size(); i++)
        {
            if (i == 1)
                if (startsWith(ss_tokens[i], "UNKNOWN_SENSE_"))
                    unknown_sense_index = toint(ss_tokens[i].substr(14, ss_tokens[i].size())) + 1;
            node->syns.push_back(ss_tokens[i]);
        }
        if (tokens.size() == 4)
        {
            node->fnum = toint(offset_tokens[0]);
            node->hereiam = tolong(offset_tokens[1]);
        }
        synsets[node->ss_id] = node;
    }
    synset_index = ss_id + 1;
    if_synsets.close();
    int n_lines = ShellProgressBar::getAsciiFileLineCount(ontology_file);
    ShellProgressBar progress(0, n_lines - 1, "loading ontology", 50);
    progress.draw();
    int counter = 0;
    while (!if_ontology.eof())
    {
        getline(if_ontology, line, '\n');
        progress.update(counter++);
        if (line == "") continue;
        if (line[0] == '#') continue;
        vector<string> tokens = split(line);
        if (tokens.size() != 3)
        {
            PLERROR("the ontology file has not the expected format");
        }
        int id = toint(tokens[1]);
        int child_id;

        if (tokens[0] == "w")
        {
            bool is_in_wn = tobool(tokens[2]);
            word_is_in_wn[id] = is_in_wn;
            if (is_in_wn)
                in_wn_word_count++;
            else
                out_of_wn_word_count++;
        }
        else if (tokens[0] == "s")
        {
            child_id = toint(tokens[2]);
            word_to_senses[child_id].insert(id);
            sense_to_words[id].insert(child_id);
            for (SetIterator tit = synsets[id]->types.begin(); tit != synsets[id]->types.end(); ++tit)
            {
                int type = *tit;
                switch (type)
                {
                case NOUN_TYPE:
                    word_to_noun_senses[child_id].insert(id);
                    break;
                case VERB_TYPE:
                    word_to_verb_senses[child_id].insert(id);
                    break;
                case ADJ_TYPE:
                    word_to_adj_senses[child_id].insert(id);
                    break;
                case ADV_TYPE:
                    word_to_adv_senses[child_id].insert(id);
                    break;
                }
            }
        }
        else if (tokens[0] == "c")
        {
            child_id = toint(tokens[2]);
            synsets[child_id]->parents.insert(id);
            synsets[id]->children.insert(child_id);
        }
    }
    if_ontology.close();
    progress.done();
}

void WordNetOntology::load(string voc_file, string synset_file, string ontology_file, string sense_key_file)
{
    load(voc_file, synset_file, ontology_file);

    ifstream if_sense_key(sense_key_file.c_str());
    if (!if_sense_key) PLERROR("can't open %s", sense_key_file.c_str());

    string line;
    while (!if_sense_key.eof())
    {
        getline(if_sense_key, line, '\n');
        if (line == "") continue;
        if (line[0] == '#' && line[1] == '#') continue;
        vector<string> tokens = split(line, " ");
        if (tokens.size() != 3)
            PLERROR("sense_key_file %s not compatible", sense_key_file.c_str());
        pair<int, string> ss(toint(tokens[1]), tokens[0]);
        sense_key_to_ss_id[ss] = toint(tokens[2]);
        pair<int, int> ws(toint(tokens[1]), toint(tokens[2]));
        ws_id_to_sense_key[ws] = tokens[0];
    }
    if_sense_key.close();
}

void WordNetOntology::printNodes()
{
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        Node* node = it->second;
        cout << "Node id = " << node->ss_id << " | parents = ";
        for (SetIterator pit = node->parents.begin(); pit != node->parents.end(); ++pit)
        {
            cout << *pit << " ";
        }
        cout << " | children = ";
        for (SetIterator cit = node->children.begin(); cit != node->children.end(); ++cit)
        {
            cout << *cit << " ";
        }
        cout << endl;
    }
}
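
// Precomputes, for every synset and every word, the set of its ancestors.
// With cut_with_word_coverage, the climb stops at synsets covering at least
// 'threshold' words; otherwise 'threshold' bounds the number of levels.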
void WordNetOntology::extractAncestors(int threshold, bool cut_with_word_coverage, bool exclude_itself)
{
#ifdef VERBOSE
    cout << "extracting ancestors... ";
#endif

    if (cut_with_word_coverage && !are_descendants_extracted)
    {
        cout << "*** I need to extract descendants before I can extract ancestors with a word coverage threshold ***" << endl;
        extractDescendants();
    }

    int n_sense_ancestors = 0;
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        int ss = it->first;
        Node* node = it->second;
        Set ancestors;
        if (cut_with_word_coverage)
            extractAncestors(node, ancestors, threshold);
        else
            extractAncestors(node, ancestors, 1, threshold);
        if (!exclude_itself)
            ancestors.insert(ss);
        synset_to_ancestors[ss] = ancestors;
        n_sense_ancestors += ancestors.size();
    }

    are_ancestors_extracted = true;

    int n_word_ancestors = 0;
    for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it)
    {
        int word_id = it->first;
        Set senses = it->second;
        Set word_ancestors;
        for (SetIterator sit = senses.begin(); sit != senses.end(); ++sit)
        {
            int sense_id = *sit;
            Set ancestors = getSynsetAncestors(sense_id);
            word_ancestors.merge(ancestors);
            word_ancestors.insert(sense_id);
        }
        word_to_ancestors[word_id] = word_ancestors;
        n_word_ancestors += word_ancestors.size();
    }

#ifdef VERBOSE
    cout << "(" << n_sense_ancestors << " sense ancestors, "
         << n_word_ancestors << " word ancestors)" << endl;
#endif
}

void WordNetOntology::extractAncestors(Node* node, Set ancestors, int word_coverage_threshold)
{
    for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it)
    {
        int ss_id = *it;
        if (word_coverage_threshold == -1 ||
            (int)synset_to_word_descendants[ss_id].size() < word_coverage_threshold)
        {
            ancestors.insert(ss_id);
            extractAncestors(synsets[ss_id], ancestors, word_coverage_threshold);
        }
    }
}

void WordNetOntology::extractAncestors(Node* node, Set ancestors, int level, int level_threshold)
{
    for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it)
    {
        ancestors.insert(*it);
        if (level_threshold == -1 || level < level_threshold)
            extractAncestors(synsets[*it], ancestors, level + 1, level_threshold);
    }
}

Set WordNetOntology::getSynsetAncestors(int id, int max_level)
{
    if (are_ancestors_extracted)
    {
        if (!isSynset(id))
        {
#ifndef NOWARNING
            PLWARNING("asking for a non-synset id (%d)", id);
#endif
        }
        return synset_to_ancestors[id];
    }
    else
    {
        Set ancestors;
        if (isSynset(id))
        {
#ifndef NOWARNING
            PLWARNING("using non-pre-computed version");
#endif
            extractAncestors(synsets[id], ancestors, 1, max_level);
        }
        else
        {
#ifndef NOWARNING
            PLWARNING("asking for a non-synset id (%d)", id);
#endif
        }
        return ancestors;
    }
}

Set WordNetOntology::getSynsetParents(int id)
{
    return synsets[id]->parents;
}

Set WordNetOntology::getWordAncestors(int id, int max_level)
{
    if (are_ancestors_extracted)
    {
        if (!isWord(id))
        {
#ifndef NOWARNING
            PLWARNING("asking for a non-word id (%d)", id);
#endif
        }
        return word_to_ancestors[id];
    }
    else
    {
        Set word_ancestors;
        if (isWord(id))
        {
#ifndef NOWARNING
            PLWARNING("using non-pre-computed version");
#endif
            for (SetIterator it = word_to_senses[id].begin(); it != word_to_senses[id].end(); ++it)
            {
                int sense_id = *it;
                word_ancestors.insert(sense_id);
                Set synset_ancestors = getSynsetAncestors(sense_id, max_level);
                word_ancestors.merge(synset_ancestors);
            }
        }
        else
        {
#ifndef NOWARNING
            PLWARNING("asking for a non-word id");
#endif
        }
        return word_ancestors;
    }
}

bool WordNetOntology::isInWordNet(int word_id)
{
#ifndef NOWARNING
    if (!isWord(word_id))
    {
        PLWARNING("asking for a non-word id (%d)", word_id);
        return false;
    }
#endif
    return word_is_in_wn[word_id];
}

string WordNetOntology::getSenseKey(int word_id, int ss_id)
{
    pair<int, int> ws(word_id, ss_id);
    if (ws_id_to_sense_key.find(ws) == ws_id_to_sense_key.end())
        return "";
    return ws_id_to_sense_key[ws];
}

int WordNetOntology::getSynsetIDForSenseKey(int word_id, string sense_key)
{
    pair<int, string> ss(word_id, sense_key);
    map<pair<int, string>, int>::iterator it = sense_key_to_ss_id.find(ss);
    if (it == sense_key_to_ss_id.end())
        return -1;
    else
        return it->second;
}

int WordNetOntology::getWordId(string word)
{
    map<string, int>::iterator it = words_id.find(word);
    if (it == words_id.end())
    {
        map<string, int>::iterator iit = words_id.find(OOV_TAG);
        if (iit == words_id.end())
            return -1;
        else
            return iit->second;
    }
    else
    {
        return it->second;
    }
}

string WordNetOntology::getWord(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return NULL_TAG;
    }
#endif
    return words[id];
}

Set WordNetOntology::getWordSenses(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return Set();
    }
#endif
    return word_to_senses[id];
}

Set WordNetOntology::getWordHighLevelSenses(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return Set();
    }
#endif
    if (!are_word_high_level_senses_extracted)
        PLERROR("word high-level senses have not been extracted");

    return word_to_high_level_senses[id];
}

Set WordNetOntology::getWordNounSenses(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return Set();
    }
#endif
    return word_to_noun_senses[id];
}

Set WordNetOntology::getWordVerbSenses(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return Set();
    }
#endif
    return word_to_verb_senses[id];
}

Set WordNetOntology::getWordAdjSenses(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return Set();
    }
#endif
    return word_to_adj_senses[id];
}

Set WordNetOntology::getWordAdvSenses(int id)
{
#ifndef NOWARNING
    if (!isWord(id))
    {
        PLWARNING("asking for a non-word id (%d)", id);
        return Set();
    }
#endif
    return word_to_adv_senses[id];
}

Set WordNetOntology::getWordsForSense(int id)
{
#ifndef NOWARNING
    if (!isSense(id))
    {
        PLWARNING("asking for a non-sense id (%d)", id);
        return Set();
    }
#endif
    return sense_to_words[id];
}

Node* WordNetOntology::getSynset(int id)
{
#ifndef NOWARNING
    if (!isSynset(id))
    {
        PLWARNING("asking for a non-synset id (%d)", id);
        return NULL;
    }
    if (synsets.find(id) == synsets.end()) {
        PLWARNING("Asking for a non-existent synset id (%d)", id);
        return NULL;
    }
#endif
    return synsets[id];
}

void WordNetOntology::printSynsetAncestors()
{
    if (!are_ancestors_extracted)
    {
        extractAncestors(WORD_COVERAGE_THRESHOLD, true, true);
    }
    for (map<int, Set>::iterator it = synset_to_ancestors.begin(); it != synset_to_ancestors.end(); ++it)
    {
        cout << it->first << " -> ";
        for (SetIterator iit = it->second.begin(); iit != it->second.end(); ++iit)
            cout << *iit << " ";
        cout << endl;
    }
}

void WordNetOntology::printWordAncestors()
{
    if (!are_ancestors_extracted)
    {
        extractAncestors(WORD_COVERAGE_THRESHOLD, true, true);
    }
    for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it)
    {
        int id = it->first;
        cout << id << " -> ";
        Set ancestors = getWordAncestors(id);
        for (SetIterator iit = ancestors.begin(); iit != ancestors.end(); ++iit)
        {
            cout << *iit << " ";
        }
        cout << endl;
    }
}
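
// Precomputes, for every synset, the sets of sense and word descendants
// reachable below it.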
void WordNetOntology::extractDescendants()
{
#ifdef VERBOSE
    cout << "extracting descendants... ";
#endif

    int n_sense_descendants = 0;
    int n_word_descendants = 0;
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        Set sense_descendants;
        Set word_descendants;
        extractDescendants(it->second, sense_descendants, word_descendants);
        synset_to_sense_descendants[it->first] = sense_descendants;
        synset_to_word_descendants[it->first] = word_descendants;
        n_sense_descendants += sense_descendants.size();
        n_word_descendants += word_descendants.size();
    }
    are_descendants_extracted = true;

#ifdef VERBOSE
    cout << "(" << n_sense_descendants << " senses, "
         << n_word_descendants << " words)" << endl;
#endif
}

void WordNetOntology::extractDescendants(Node* node, Set sense_descendants, Set word_descendants)
{
    int ss_id = node->ss_id;
    if (isSense(ss_id))
    {
        sense_descendants.insert(ss_id);
        for (SetIterator it = sense_to_words[ss_id].begin(); it != sense_to_words[ss_id].end(); ++it)
        {
            int word_id = *it;
            word_descendants.insert(word_id);
        }
    }
    for (SetIterator it = node->children.begin(); it != node->children.end(); ++it)
    {
        extractDescendants(synsets[*it], sense_descendants, word_descendants);
    }
}

void WordNetOntology::extractStrictDescendants(Node* node, Set sense_descendants, Set word_descendants)
{
    int ss_id = node->ss_id;
    if (isSense(ss_id)) {
        for (SetIterator it = sense_to_words[ss_id].begin(); it != sense_to_words[ss_id].end(); ++it) {
            int word_id = *it;
            word_descendants.insert(word_id);
        }
    }
    for (SetIterator it = node->children.begin(); it != node->children.end(); ++it) {
        extractDescendants(synsets[*it], sense_descendants, word_descendants);
    }
}

Set WordNetOntology::getSynsetSenseDescendants(int id)
{
    if (are_descendants_extracted)
    {
        if (!isSynset(id))
        {
#ifndef NOWARNING
            PLWARNING("asking for a non-synset id (%d)", id);
#endif
        }
        return synset_to_sense_descendants[id];
    }

    Set sense_descendants;
    if (isSynset(id))
    {
#ifndef NOWARNING
        PLWARNING("using non-pre-computed version");
#endif
        extractDescendants(synsets[id], sense_descendants, Set());
    }
    else
    {
#ifndef NOWARNING
        PLWARNING("asking for non-synset id (%d)", id);
#endif
    }
    return sense_descendants;
}

Set WordNetOntology::getSynsetWordDescendants(int id)
{
    if (are_descendants_extracted)
    {
        if (!isSynset(id))
        {
#ifndef NOWARNING
            PLWARNING("asking for a non-synset id (%d)", id);
#endif
        }
        return synset_to_word_descendants[id];
    }

    Set word_descendants;
    if (isSynset(id))
    {
#ifndef NOWARNING
        PLWARNING("using non-pre-computed version");
#endif
        extractDescendants(synsets[id], Set(), word_descendants);
    }
    else
    {
#ifndef NOWARNING
        PLWARNING("asking for non-synset id (%d)", id);
#endif
    }
    return word_descendants;
}

void WordNetOntology::printDescendants()
{
}

bool WordNetOntology::isWord(int id)
{
    return (words.find(id) != words.end());
}

bool WordNetOntology::isWord(string word)
{
    return (words_id.find(word) != words_id.end());
}

bool WordNetOntology::isSense(int id)
{
    return (sense_to_words.find(id) != sense_to_words.end());
}

bool WordNetOntology::isPureSense(int id)
{
    return (isSense(id) && synsets[id]->children.size() == 0);
}

bool WordNetOntology::isCategory(int id)
{
    return isSynset(id);
}

bool WordNetOntology::isPureCategory(int id)
{
    return (isCategory(id) && !isSense(id));
}

bool WordNetOntology::isSynset(int id)
{
    return (synsets.find(id) != synsets.end());
}

int WordNetOntology::overlappingSynsets(int ss_id1, int ss_id2)
{
    Set words1 = sense_to_words[ss_id1];
    Set words2 = sense_to_words[ss_id2];
    Set overlap;
    for (SetIterator it1 = words1.begin(); it1 != words1.end(); ++it1)
        if (words2.contains(*it1))
            overlap.insert(*it1);
    return overlap.size();
}

Set WordNetOntology::getAllWords()
{
    Set all_words;
    for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it)
    {
        all_words.insert(it->first);
    }
    return all_words;
}

Set WordNetOntology::getAllSenses()
{
    Set senses;
    for (map<int, Set>::iterator it = sense_to_words.begin(); it != sense_to_words.end(); ++it)
    {
        senses.insert(it->first);
    }
    return senses;
}

Set WordNetOntology::getAllCategories()
{
    Set categories;
    for (map<int, Node*>::iterator it = synsets.begin(); it != synsets.end(); ++it)
    {
        categories.insert(it->first);
    }
    return categories;
}

void WordNetOntology::printWordOntology(int id)
{
    cout << words[id] << endl;
    for (SetIterator sit = word_to_senses[id].begin(); sit != word_to_senses[id].end(); ++sit)
    {
        int sense_id = *sit;
        printSynset(sense_id, 1);
        printOntology(synsets[sense_id], 2);
    }
}

void WordNetOntology::printWordOntology(string word)
{
    printWordOntology(words_id[word]);
}

void WordNetOntology::printInvertedSynsetOntology(int id, int level)
{
    if (isSynset(id))
    {
        printSynset(id, level);
        for (SetIterator it = synsets[id]->children.begin(); it != synsets[id]->children.end(); ++it)
        {
            printInvertedSynsetOntology(*it, level + 1);
        }
    }
    else
    {
#ifndef NOWARNING
        PLWARNING("asking for a non-synset id (%d)", id);
#endif
    }
}

void WordNetOntology::intersectAncestorsAndSenses(Set categories, Set senses)
{
    // keep only the word ancestors that appear in 'categories'
    for (map<int, Set>::iterator it = word_to_ancestors.begin(); it != word_to_ancestors.end(); ++it)
    {
        it->second.intersection(categories);
    }

    // remove the synsets that are neither categories nor senses
    Set keys_to_be_removed;
    for (map<int, Set>::iterator it = synset_to_ancestors.begin(); it != synset_to_ancestors.end(); ++it)
    {
        if (!categories.contains(it->first) && !senses.contains(it->first))
            keys_to_be_removed.insert(it->first);
    }
    for (SetIterator it = keys_to_be_removed.begin(); it != keys_to_be_removed.end(); ++it)
    {
        synset_to_ancestors.erase(*it);
        synsets.erase(*it);
    }

    // keep only the synset ancestors that appear in 'categories'
    for (map<int, Set>::iterator it = synset_to_ancestors.begin(); it != synset_to_ancestors.end(); ++it)
    {
        it->second.intersection(categories);
    }

    // keep only the word senses that appear in 'senses'
    for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it)
    {
        it->second.intersection(senses);
    }

    keys_to_be_removed->clear();
    for (map<int, Set>::iterator it = sense_to_words.begin(); it != sense_to_words.end(); ++it)
    {
        if (!senses.contains(it->first))
            keys_to_be_removed.insert(it->first);
    }
    for (SetIterator it = keys_to_be_removed.begin(); it != keys_to_be_removed.end(); ++it)
    {
        sense_to_words.erase(*it);
    }
}
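
// Design note: the erasures above are done in two passes (collect the ids
// into keys_to_be_removed, then erase in a separate loop) so that no map
// is modified while it is being iterated, which would invalidate the
// iterator in use.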

bool WordNetOntology::isWordUnknown(string word)
{
    return isWordUnknown(words_id[word]);
}

bool WordNetOntology::isWordUnknown(int id)
{
    bool is_unknown = true;
    for (SetIterator it = word_to_senses[id].begin(); it != word_to_senses[id].end(); ++it)
    {
        if (!synsets[*it]->is_unknown)
            is_unknown = false;
    }
    return is_unknown;
}

bool WordNetOntology::isSynsetUnknown(int id)
{
    return synsets[id]->is_unknown;
}

void WordNetOntology::getDownToUpParentCategoriesAtLevel(int ss_id, int target_level, Set categories, int cur_level)
{
    Node* node = synsets[ss_id];
    if (cur_level == target_level && !isTopLevelCategory(ss_id))
    {
        categories.insert(ss_id);
    }
    else
    {
        for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it)
            getDownToUpParentCategoriesAtLevel(*it, target_level, categories, cur_level + 1);
    }
}

void WordNetOntology::getCategoriesAtLevel(int ss_id, int cur_level, int target_level, set<int>& categories)
{
    Node* node = synsets[ss_id];
    if (cur_level == target_level && !isTopLevelCategory(ss_id))
    {
        categories.insert(ss_id);
    }
    else
    {
        for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it)
        {
            getCategoriesAtLevel(*it, cur_level + 1, target_level, categories);
        }
    }
}
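
// Usage sketch (illustrative; 'ontology' and 'sense_id' are hypothetical):
// walking exactly two levels up from a sense collects its level-2 ancestor
// categories, skipping the artificial top-level synsets.
//
//     set<int> cats;
//     ontology.getCategoriesAtLevel(sense_id, 0, 2, cats);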

void WordNetOntology::getCategoriesUnderLevel(int ss_id, int cur_level, int target_level, Set categories)
{
    Node* node = synsets[ss_id];
    if (!isTopLevelCategory(ss_id))
        categories.insert(ss_id);
    if (cur_level != target_level)
    {
        for (SetIterator it = node->parents.begin(); it != node->parents.end(); ++it)
            getCategoriesUnderLevel(*it, cur_level + 1, target_level, categories);
    }
}

void WordNetOntology::reducePolysemy(int level)
{
    ShellProgressBar progress(0, words.size() - 1, "reducing polysemy", 50);
    progress.init();
    progress.draw();
    int count = 0;
    for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it)
    {
        int word_id = it->first;
        reduceWordPolysemy_preserveSenseOverlapping(word_id, level);
        progress.update(count++);
    }
    progress.done();
    removeNonReachableSynsets();
}
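
// Usage sketch (illustrative; 'ontology' is hypothetical): reducing
// polysemy at level 2 drops, for each word, redundant senses whose level-2
// ancestor categories coincide, then sweeps away synsets that are no
// longer reachable from any word.
//
//     ontology.reducePolysemy(2);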

void WordNetOntology::reduceWordPolysemy(int word_id, int level)
{
    Set senses = word_to_senses[word_id];
    Set senses_to_be_removed;
    if (senses.size() > 1)
    {
        set<set<int> > ss;
        for (SetIterator it = senses.begin(); it != senses.end(); ++it)
        {
            int sense_id = *it;
            set<int> categories_at_level;
            getCategoriesAtLevel(sense_id, 0, level, categories_at_level);
            if (categories_at_level.size() != 0)
            {
                // if another sense of this word already maps to the same
                // set of level-'level' categories, drop this sense
                bool already_there = (ss.find(categories_at_level) != ss.end());
                if (already_there)
                {
                    senses_to_be_removed.insert(sense_id);
                    sense_to_words[sense_id].remove(word_id);
                    if (sense_to_words[sense_id].isEmpty())
                        sense_to_words.erase(sense_id);
                }
                else
                {
                    ss.insert(categories_at_level);
                }
            }
        }
        for (SetIterator it = senses_to_be_removed.begin(); it != senses_to_be_removed.end(); ++it)
        {
            int sense_id = *it;
            word_to_senses[word_id].remove(sense_id);
            word_to_noun_senses[word_id].remove(sense_id);
            word_to_verb_senses[word_id].remove(sense_id);
            word_to_adj_senses[word_id].remove(sense_id);
            word_to_adv_senses[word_id].remove(sense_id);
        }
    }
}

void WordNetOntology::reduceWordPolysemy_preserveSenseOverlapping(int word_id, int level)
{
    Set senses = word_to_senses[word_id];
    Set senses_to_be_removed;
    map<set<int>, Set> categories_to_senses;
    if (senses.size() > 1)
    {
        // cluster the senses of the word by their set of level-'level' categories
        for (SetIterator it = senses.begin(); it != senses.end(); ++it)
        {
            int sense_id = *it;
            set<int> categories_at_level;
            getCategoriesAtLevel(sense_id, 0, level, categories_at_level);
            if (categories_at_level.size() != 0)
                categories_to_senses[categories_at_level].insert(sense_id);
        }

        // within each cluster, remove the senses used by no other word,
        // keeping at least one sense per cluster
        for (map<set<int>, Set>::iterator it = categories_to_senses.begin(); it != categories_to_senses.end(); ++it)
        {
            Set sense_cluster = it->second;
            if (sense_cluster.size() > 1)
            {
                int sense_cluster_size = sense_cluster.size();
                int n_sense_removed = 0;
                for (SetIterator sit = sense_cluster.begin(); sit != sense_cluster.end(); ++sit)
                {
                    int sense_id = *sit;
                    if (sense_to_words[sense_id].size() < 2 && n_sense_removed < (sense_cluster_size - 1))
                    {
                        senses_to_be_removed.insert(sense_id);
                        sense_to_words[sense_id].remove(word_id);
                        if (sense_to_words[sense_id].isEmpty())
                            sense_to_words.erase(sense_id);
                        n_sense_removed++;
                    }
                }
            }
        }

        if (!senses_to_be_removed.isEmpty())
        {
            cout << words[word_id] << endl;
        }

        for (SetIterator it = senses_to_be_removed.begin(); it != senses_to_be_removed.end(); ++it)
        {
            int sense_id = *it;
            printSynset(sense_id, 1);
            word_to_senses[word_id].remove(sense_id);
            word_to_noun_senses[word_id].remove(sense_id);
            word_to_verb_senses[word_id].remove(sense_id);
            word_to_adj_senses[word_id].remove(sense_id);
            word_to_adv_senses[word_id].remove(sense_id);
        }
    }
}

void WordNetOntology::reduceWordPolysemy_preserveSenseOverlapping2(int word_id, int level)
{
}

void WordNetOntology::removeNonReachableSynsets()
{
    // mark: visit upward from every sense actually attached to a word
    for (map<int, Set>::iterator wit = word_to_senses.begin(); wit != word_to_senses.end(); ++wit)
    {
        Set senses = wit->second;
        for (SetIterator sit = senses.begin(); sit != senses.end(); ++sit)
        {
            int sense_id = *sit;
            visitUpward(synsets[sense_id]);
        }
    }

    // sweep: collect unvisited synsets, and prune unvisited children
    // from the visited ones
    Set synsets_to_be_removed;
    for (map<int, Node*>::iterator sit = synsets.begin(); sit != synsets.end(); ++sit)
    {
        int ss_id = sit->first;
        Node* node = sit->second;
        if (!node->visited)
        {
            synsets_to_be_removed.insert(ss_id);
        }
        else
        {
            Set children_to_be_removed;
            for (SetIterator cit = node->children.begin(); cit != node->children.end(); ++cit)
            {
                int child_id = *cit;
                if (!synsets[child_id]->visited)
                    children_to_be_removed.insert(child_id);
            }
            for (SetIterator rit = children_to_be_removed.begin(); rit != children_to_be_removed.end(); ++rit)
                node->children.remove(*rit);
        }
    }

    for (SetIterator rit = synsets_to_be_removed.begin(); rit != synsets_to_be_removed.end(); ++rit)
    {
        int ss_id = *rit;
        delete(synsets[ss_id]);
        synsets.erase(ss_id);
    }

    unvisitAll();
}
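
// Design note: removeNonReachableSynsets() is a mark-and-sweep pass over
// the synset graph: visitUpward() marks everything reachable from a word's
// senses, the sweep deletes unmarked synsets and unlinks them from their
// parents' child sets, and unvisitAll() clears the 'visited' flags so a
// later pass starts clean.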

void WordNetOntology::removeWord(int id)
{
    string word_string = words[id];
    words.erase(id);
    word_to_senses.erase(id);
    word_to_noun_senses.erase(id);
    word_to_verb_senses.erase(id);
    word_to_adj_senses.erase(id);
    word_to_adv_senses.erase(id);
    words_id.erase(word_string);
    word_to_noun_wnsn.erase(id);
    word_to_verb_wnsn.erase(id);
    word_to_adj_wnsn.erase(id);
    word_to_adv_wnsn.erase(id);
    word_to_predominent_pos.erase(id);
    word_to_high_level_senses.erase(id);
}

void WordNetOntology::visitUpward(Node* node)
{
    node->visited = true;
    for (SetIterator pit = node->parents.begin(); pit != node->parents.end(); ++pit)
    {
        int parent_id = *pit;
        if (!synsets[parent_id]->visited)
            visitUpward(synsets[parent_id]);
    }
}

void WordNetOntology::unvisitDownward(Node* node)
{
    node->visited = false;
    for (SetIterator s_it = node->children.begin(); s_it != node->children.end(); ++s_it)
    {
        Node* child = synsets[*s_it];
        if (child->visited)
            unvisitDownward(child);
    }
}

void WordNetOntology::detectWordsWithoutOntology()
{
    for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it)
    {
        int word_id = it->first;
        Set senses = it->second;
        if (senses.isEmpty())
            PLWARNING("word %d (%s) has no attached ontology", word_id, words[word_id].c_str());
    }
}

int WordNetOntology::getMaxSynsetId()
{
    // synsets is a map ordered on its int keys, so the last entry holds
    // the largest synset id
    return synsets.rbegin()->first;
}

Set WordNetOntology::getSyntacticClassesForWord(int word_id)
{
#ifndef NOWARNING
    if (!isWord(word_id))
        PLWARNING("asking for a non-word id (%d)", word_id);
#endif
    Set syntactic_classes;
    Set senses = word_to_senses[word_id];
    for (SetIterator it = senses.begin(); it != senses.end(); ++it)
    {
        Node* node = synsets[*it];
        for (SetIterator tit = node->types.begin(); tit != node->types.end(); ++tit)
            syntactic_classes.insert(*tit);
    }
    return syntactic_classes;
}

int WordNetOntology::getSyntacticClassForSense(int sense_id)
{
#ifndef NOWARNING
    if (!isSense(sense_id))
        PLWARNING("asking for a non-sense id (%d)", sense_id);
#endif
    Node* sense = synsets[sense_id];
    if (sense->types.size() > 1)
        PLWARNING("a sense has more than 1 POS type");
    int type = *(sense->types.begin());
    return type;
}

int WordNetOntology::getPredominentSyntacticClassForWord(int word_id)
{
#ifndef NOWARNING
    if (!isWord(word_id))
        PLWARNING("asking for a non-word id (%d)", word_id);
#endif
    if (are_predominent_pos_extracted)
        return word_to_predominent_pos[word_id];
    int n_noun = 0;
    int n_verb = 0;
    int n_adj = 0;
    int n_adv = 0;
    Set senses = word_to_senses[word_id];
    for (SetIterator it = senses.begin(); it != senses.end(); ++it)
    {
        int sense_id = *it;
        int type = getSyntacticClassForSense(sense_id);
        switch (type)
        {
        case NOUN_TYPE:
            n_noun++;
            break;
        case VERB_TYPE:
            n_verb++;
            break;
        case ADJ_TYPE:
            n_adj++;
            break;
        case ADV_TYPE:
            n_adv++;
        }
    }
    if (n_noun == 0 && n_verb == 0 && n_adj == 0 && n_adv == 0)
        return UNDEFINED_TYPE;
    else if (n_noun >= n_verb && n_noun >= n_adj && n_noun >= n_adv)
        return NOUN_TYPE;
    else if (n_verb >= n_noun && n_verb >= n_adj && n_verb >= n_adv)
        return VERB_TYPE;
    else if (n_adj >= n_noun && n_adj >= n_verb && n_adj >= n_adv)
        return ADJ_TYPE;
    else
        return ADV_TYPE;
}
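
// Usage sketch (illustrative; 'ontology' and 'word_id' are hypothetical):
// note that the cascade above breaks ties in noun > verb > adj > adv
// order, so a word with as many noun as verb senses is reported as a noun.
//
//     int pos = ontology.getPredominentSyntacticClassForWord(word_id);
//     if (pos == NOUN_TYPE)
//         cout << "predominantly a noun" << endl;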

void WordNetOntology::extractPredominentSyntacticClasses()
{
    for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it)
    {
        int word_id = it->first;
        word_to_predominent_pos[word_id] = getPredominentSyntacticClassForWord(word_id);
    }
    are_predominent_pos_extracted = true;
}

void WordNetOntology::savePredominentSyntacticClasses(string file)
{
    ofstream out_pos(file.c_str());
    for (map<int, Set>::iterator it = word_to_senses.begin(); it != word_to_senses.end(); ++it)
    {
        int word_id = it->first;
        out_pos << getPredominentSyntacticClassForWord(word_id) << endl;
    }
    out_pos.close();
}

void WordNetOntology::loadPredominentSyntacticClasses(string file)
{
    ifstream in_pos(file.c_str());
    int line_counter = 0;
    while (!in_pos.eof())
    {
        string line = pgetline(in_pos);
        if (line == "") continue;
        int pos = toint(line);
        word_to_predominent_pos[line_counter++] = pos;
    }
    in_pos.close();
    are_predominent_pos_extracted = true;
}

bool WordNetOntology::isTopLevelCategory(int ss_id)
{
    return (ss_id == ROOT_SS_ID || ss_id == SUPER_UNKNOWN_SS_ID ||
            ss_id == NOUN_SS_ID || ss_id == VERB_SS_ID ||
            ss_id == ADJ_SS_ID || ss_id == ADV_SS_ID ||
            ss_id == OOV_SS_ID || ss_id == PROPER_NOUN_SS_ID ||
            ss_id == NUMERIC_SS_ID || ss_id == PUNCTUATION_SS_ID ||
            ss_id == STOP_SS_ID || ss_id == UNDEFINED_SS_ID ||
            ss_id == BOS_SS_ID || ss_id == EOS_SS_ID);
}

void WordNetOntology::getDescendantCategoriesAtLevel(int ss_id, int cur_level, int target_level, Set categories)
{
    if (isSynset(ss_id))
    {
        Node* node = synsets[ss_id];

        // a sense reached before the target level is recorded so it can
        // later be merged in as a high-level sense for its words
        if (cur_level < target_level && isSense(ss_id))
        {
            Set words = sense_to_words[ss_id];
            for (SetIterator wit = words.begin(); wit != words.end(); ++wit)
                word_to_under_target_level_high_level_senses[*wit].insert(ss_id);
        }

        if (cur_level == target_level)
            categories.insert(ss_id);
        else
        {
            for (SetIterator it = node->children.begin(); it != node->children.end(); ++it)
                getDescendantCategoriesAtLevel(*it, cur_level + 1, target_level, categories);
        }
    }
}

void WordNetOntology::extractWordHighLevelSenses(int noun_depth, int verb_depth, int adj_depth, int adv_depth, int unk_depth)
{
    // for each POS root, gather the categories at the requested depth and
    // map every word found under those categories to them
    Set noun_categories;
    getDescendantCategoriesAtLevel(NOUN_SS_ID, 0, noun_depth, noun_categories);
    for (SetIterator sit = noun_categories.begin(); sit != noun_categories.end(); ++sit)
    {
        int ss_id = *sit;
        Set word_descendants = getSynsetWordDescendants(ss_id);
        for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit)
        {
            int word_id = *wit;
            word_to_high_level_senses[word_id].insert(ss_id);
        }
    }

    Set verb_categories;
    getDescendantCategoriesAtLevel(VERB_SS_ID, 0, verb_depth, verb_categories);
    for (SetIterator sit = verb_categories.begin(); sit != verb_categories.end(); ++sit)
    {
        int ss_id = *sit;
        Set word_descendants = getSynsetWordDescendants(ss_id);
        for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit)
        {
            int word_id = *wit;
            word_to_high_level_senses[word_id].insert(ss_id);
        }
    }

    Set adj_categories;
    getDescendantCategoriesAtLevel(ADJ_SS_ID, 0, adj_depth, adj_categories);
    for (SetIterator sit = adj_categories.begin(); sit != adj_categories.end(); ++sit)
    {
        int ss_id = *sit;
        Set word_descendants = getSynsetWordDescendants(ss_id);
        for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit)
        {
            int word_id = *wit;
            word_to_high_level_senses[word_id].insert(ss_id);
        }
    }

    Set adv_categories;
    getDescendantCategoriesAtLevel(ADV_SS_ID, 0, adv_depth, adv_categories);
    for (SetIterator sit = adv_categories.begin(); sit != adv_categories.end(); ++sit)
    {
        int ss_id = *sit;
        Set word_descendants = getSynsetWordDescendants(ss_id);
        for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit)
        {
            int word_id = *wit;
            word_to_high_level_senses[word_id].insert(ss_id);
        }
    }

    Set unk_categories;
    getDescendantCategoriesAtLevel(SUPER_UNKNOWN_SS_ID, 0, unk_depth, unk_categories);
    for (SetIterator sit = unk_categories.begin(); sit != unk_categories.end(); ++sit)
    {
        int ss_id = *sit;
        Set word_descendants = getSynsetWordDescendants(ss_id);
        for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit)
        {
            int word_id = *wit;
            word_to_high_level_senses[word_id].insert(ss_id);
        }
    }

    // words that got no high-level sense fall back to their plain senses
    for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it)
    {
        int word_id = it->first;
        if (word_to_high_level_senses[word_id].size() == 0)
            word_to_high_level_senses[word_id] = word_to_senses[word_id];
    }

    are_word_high_level_senses_extracted = true;
}

void WordNetOntology::extractWordNounAndVerbHighLevelSenses(int noun_depth, int verb_depth)
{
    // adjective and adverb senses are kept as-is
    for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it)
    {
        int word_id = it->first;
        word_to_high_level_senses[word_id] = word_to_adv_senses[word_id];
        word_to_high_level_senses[word_id].merge(word_to_adj_senses[word_id]);
    }

    Set noun_categories;
    getDescendantCategoriesAtLevel(NOUN_SS_ID, 0, noun_depth, noun_categories);
    for (SetIterator sit = noun_categories.begin(); sit != noun_categories.end(); ++sit)
    {
        int ss_id = *sit;
        Set word_descendants = getSynsetWordDescendants(ss_id);
        for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit)
        {
            int word_id = *wit;
            word_to_high_level_senses[word_id].insert(ss_id);
        }
    }

    Set verb_categories;
    getDescendantCategoriesAtLevel(VERB_SS_ID, 0, verb_depth, verb_categories);
    for (SetIterator sit = verb_categories.begin(); sit != verb_categories.end(); ++sit)
    {
        int ss_id = *sit;
        Set word_descendants = getSynsetWordDescendants(ss_id);
        for (SetIterator wit = word_descendants.begin(); wit != word_descendants.end(); ++wit)
        {
            int word_id = *wit;
            word_to_high_level_senses[word_id].insert(ss_id);
        }
    }

    // merge in the senses that were found under the target level
    for (map<int, Set>::iterator it = word_to_under_target_level_high_level_senses.begin(); it != word_to_under_target_level_high_level_senses.end(); ++it)
    {
        word_to_high_level_senses[it->first].merge(it->second);
    }

    // words that got no high-level sense fall back to their plain senses
    for (map<int, string>::iterator it = words.begin(); it != words.end(); ++it)
    {
        int word_id = it->first;
        if (word_to_high_level_senses[word_id].size() == 0)
            word_to_high_level_senses[word_id] = word_to_senses[word_id];
    }

    are_word_high_level_senses_extracted = true;
}

int WordNetOntology::getWordSenseUniqueId(int word, int sense)
{
    if (!are_word_sense_unique_ids_computed)
        computeWordSenseUniqueIds();
    pair<int, int> ws(word, sense);
    if (word_sense_to_unique_id.find(ws) == word_sense_to_unique_id.end())
        return -1;
    return word_sense_to_unique_id[ws];
}
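
// Usage sketch (illustrative; 'ontology', 'word_id' and 'sense_id' are
// hypothetical): the mapping is built lazily on first use, and -1 signals
// an unknown (word, sense) pair.
//
//     int uid = ontology.getWordSenseUniqueId(word_id, sense_id);
//     if (uid >= 0)
//         cout << "unique id: " << uid << endl;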

void WordNetOntology::computeWordSenseUniqueIds()
{
    int unique_id = 0;
    for (map<int, Set>::iterator wit = word_to_senses.begin(); wit != word_to_senses.end(); ++wit)
    {
        int w = wit->first;
        Set senses = wit->second;
        for (SetIterator sit = senses.begin(); sit != senses.end(); ++sit)
        {
            int s = *sit;
            pair<int, int> ws(w, s);
            if (word_sense_to_unique_id.find(ws) != word_sense_to_unique_id.end())
                PLERROR("in computeWordSenseUniqueIds(): dupe word/sense keys (w = %d, s = %d)", w, s);
            word_sense_to_unique_id[ws] = unique_id++;
        }
    }
    are_word_sense_unique_ids_computed = true;
}

int WordNetOntology::getWordSenseUniqueIdSize()
{
    if (!are_word_sense_unique_ids_computed)
        computeWordSenseUniqueIds();
    return (int)word_sense_to_unique_id.size();
}

string trimWord(string word)
{
    // trim forward: skip leading characters that are not letters, digits
    // or legal punctuation
    int index = 0;
    bool forward_trimmed = isLetter(word[index]) || isDigit(word[index]) || isLegalPunct(word[index]);
    while (!forward_trimmed)
    {
        index++;
        if (index >= (int)word.size())
            return NULL_TAG;
        forward_trimmed = isLetter(word[index]) || isDigit(word[index]) || isLegalPunct(word[index]);
    }
    word = word.substr(index, word.size());

    // trim backward: skip trailing characters of the same kind
    index = word.size() - 1;
    bool backward_trimmed = isLetter(word[index]) || isDigit(word[index]) || isLegalPunct(word[index]);
    while (!backward_trimmed)
    {
        index--;
        if (index < 0)
            return NULL_TAG;
        backward_trimmed = isLetter(word[index]) || isDigit(word[index]) || isLegalPunct(word[index]);
    }
    string trimmed_word = word.substr(0, index + 1);

    // a lone '.' is not a word
    if (trimmed_word == ".")
        return NULL_TAG;
    else
        return trimmed_word;
}
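
// Usage sketch (illustrative): trimWord() strips leading and trailing
// characters that are neither alphanumeric nor '.'/'_', and returns
// NULL_TAG when nothing word-like survives.
//
//     string t = trimWord("--hello!");  // yields "hello"
//     string n = trimWord("!!!");       // yields NULL_TAG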

bool isLetter(char c)
{
    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}

bool isDigit(char c)
{
    return (c >= '0' && c <= '9');
}

bool isAlpha(char c)
{
    return (isLetter(c) || isDigit(c));
}

bool isLegalPunct(char c)
{
    return (c == '.' || c == '_');
}

string stemWord(string& word)
{
    // try the WordNet morphological analyzer on each POS in turn
    char* input_word = cstr(word);
    char* lemma = morphword(input_word, NOUN);
    if (lemma == NULL)
    {
        lemma = morphword(input_word, VERB);
        if (lemma == NULL)
        {
            lemma = morphword(input_word, ADJ);
            if (lemma == NULL)
            {
                lemma = morphword(input_word, ADV);
            }
        }
    }
    if (lemma == NULL)
    {
        return word;
    }
    else
    {
        return string(lemma);
    }
}

string stemWord(string& word, int wn_pos)
{
    char* input_word = cstr(word);
    char* lemma = morphword(input_word, wn_pos);
    if (lemma == NULL)
        return word;
    else
        return string(lemma);
}
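
// Usage sketch (illustrative; the exact lemma depends on the installed
// WordNet database): the one-argument overload tries NOUN, VERB, ADJ and
// ADV in that order, while the two-argument overload queries a single POS.
//
//     string w = string("running");
//     string s1 = stemWord(w);        // first lemma found over all POS
//     string s2 = stemWord(w, VERB);  // typically "run"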

char* cstr(string& str)
{
    // returns a heap-allocated, NUL-terminated copy of 'str';
    // the caller is responsible for releasing it with delete[]
    char* cstr = new char[str.size() + 1];
    for (unsigned int i = 0; i < str.size(); i++)
        *(cstr + i) = str[i];
    cstr[str.size()] = '\0';
    return cstr;
}
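
// Usage sketch (illustrative): the buffer returned by cstr() is owned by
// the caller and must be freed to avoid a leak (the stemWord() helpers
// above currently never free it).
//
//     string s = "dog";
//     char* p = cstr(s);
//     // ... pass p to a C API ...
//     delete[] p;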

void removeDelimiters(string& s, string delim, string replace)
{
    string::size_type pos = s.find(delim, 0);
    while (pos != string::npos)
    {
        s.replace(pos, 1, replace);
        pos = s.find(delim, pos + 1);
    }
}

bool startsWith(string& base, string s)
{
    if (base.size() < s.size())
        return false;
    for (unsigned int i = 0; i < s.size(); i++)
        if (base[i] != s[i])
            return false;
    return true;
}

void replaceChars(string& str, string char_to_replace, string replacing_char)
{
    string::size_type pos = str.find(char_to_replace, 0);
    while (pos != string::npos)
    {
        str.replace(pos, 1, replacing_char);
        pos = str.find(char_to_replace, pos + 1);
    }
}
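
// Usage sketch (illustrative): both replacement helpers locate the full
// search string but splice the replacement over a single character, so
// they are intended for one-character delimiters.
//
//     string s = "a_b_c";
//     replaceChars(s, "_", " ");       // s becomes "a b c"
//     bool ok = startsWith(s, "a b");  // true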

} // namespace PLearn