00001
#include <plearn_learners/language/WordNet/WordNetOntology.h>
00002
00003
using namespace PLearn;
00004
00005
// Returns true iff 'base' begins with the prefix 's'.
bool startsWith(string& base, string s);

// Fills 'files' with up to 'n_files' SemCor tag-file paths (brown1 then
// brown2 lists) and returns the total line count across those files.
int extractFiles(vector<string>& files, int n_files);

// Maps a Penn-Treebank POS tag to a WordNet ontology type constant.
int convertPOS2WNO(string pos);
00008
00009 int main(
int argc,
char** argv)
00010 {
00011
if (argc != 7)
00012
PLERROR(
"usage : n_files voc synsets ontology out.ttext out.bttext (argc = %d)", argc);
00013
WordNetOntology ontology;
00014
set<string> pos_types;
00015
set<string> ignored_pos_types;
00016
string line;
00017
vector<string> files;
00018
int n_files =
toint(argv[1]);
00019
string voc_file =
tostring(argv[2]);
00020
string synset_file =
tostring(argv[3]);
00021
string ontology_file =
tostring(argv[4]);
00022
int total_lines =
extractFiles(files, n_files);
00023 ifstream input_if;
00024
ShellProgressBar progress(0, total_lines - 1,
"building", 50);
00025 progress.
draw();
00026
int n = 0;
00027 ofstream out_ttext(argv[5]);
00028 ofstream out_bttext(argv[6]);
00029
int n_unknown_errors = 0;
00030
int n_human_ambig_errors = 0;
00031
int n_tagged_words = 0;
00032
int n_non_tagged_words = 0;
00033
00034
for (
unsigned int i = 0; i < files.size(); i++)
00035 {
00036 input_if.open(files[i].
c_str());
00037
00038
00039
00040
while (!input_if.eof())
00041 {
00042 getline(input_if, line,
'\n');
00043
if (line ==
"")
continue;
00044 progress.
update(n++);
00045
vector<string> tokens =
split(line,
" <>");
00046
bool bos_found =
false;
00047
bool eos_found =
false;
00048
if (tokens[0] ==
"s") bos_found =
true;
00049
else if (tokens[0] ==
"/s") eos_found =
true;
00050
else if (tokens[0] !=
"wf")
continue;
00051
bool done =
false;
00052
string pos =
"";
00053
string lemma =
"";
00054
string wnsn_str =
"";
00055
int wnsn = -1;
00056
string lexsn =
"";
00057
string word =
"";
00058
int wno_wnsn = -1;
00059
int word_id = -1;
00060
bool tagged_word =
false;
00061
00062
if (tokens.size() > 2)
00063 {
00064
for (
unsigned int j = 0; j < tokens.size() - 2; j++)
00065 {
00066
string token = tokens[j];
00067
int ts = token.size();
00068
if (
startsWith(token,
"cmd="))
00069 done = (token.substr(4, ts) ==
"done");
00070
if (
startsWith(token,
"pos="))
00071 pos = token.substr(4, ts);
00072
if (
startsWith(token,
"lemma="))
00073 lemma = token.substr(6, ts);
00074
if (
startsWith(token,
"wnsn="))
00075 {
00076 wnsn_str = token.substr(5, ts);
00077 wnsn =
toint(token.substr(5, ts));
00078 }
00079
if (
startsWith(token,
"lexsn="))
00080 lexsn = token.substr(6, ts);
00081 }
00082 }
00083
00084
if (bos_found)
00085 word =
BOS_TAG;
00086
else if (eos_found)
00087 word =
EOS_TAG;
00088
else
00089 {
00090 word = tokens[tokens.size() - 2];
00091
if (tokens[tokens.size() - 1] !=
"/wf")
PLWARNING(
"no </wf>");
00092 }
00093
00094
string actual_word;
00095
if (lemma !=
"")
00096 actual_word = lemma;
00097
else if (word !=
"")
00098 actual_word = word;
00099
00100
else continue;
00101
00102
00103 tagged_word = (pos !=
"" && lemma !=
"" && lexsn !=
"" && done);
00104
00105 actual_word =
lowerstring(actual_word);
00106
00107
00108
00109
if (!ontology.
containsWord(actual_word))
00110 {
00111 ontology.
extractWord(actual_word,
ALL_WN_TYPE,
true,
true,
false);
00112 }
00113
00114
bool human_ambig = ((wnsn_str.find(
';', 0) != string::npos) && (lexsn.find(
';', 0) != string::npos));
00115
00116 word_id = ontology.
getWordId(actual_word);
00117
if (tagged_word && !human_ambig)
00118 {
00119
00120 wno_wnsn = ontology.
getWordSenseIdForSenseKey(lemma, lexsn);
00121
00122
if (wno_wnsn ==
WNO_ERROR)
00123 {
00124 wno_wnsn = -1;
00125 n_unknown_errors++;
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136 }
else
00137 {
00138
#ifdef CHECK
00139
Set senses = ontology.
getWordSenses(word_id);
00140
if (!senses.
contains(wno_wnsn))
00141
PLWARNING(
"weird");
00142
#endif
00143
}
00144 }
else
00145 {
00146
00147
00148 wno_wnsn = -1;
00149
if (human_ambig)
00150 n_human_ambig_errors++;
00151 }
00152
00153
if (tagged_word)
00154 {
00155 ignored_pos_types.insert(pos);
00156 }
else
00157 {
00158 pos_types.insert(pos);
00159 }
00160
00161
if (wnsn == -1) n_non_tagged_words++;
00162
else n_tagged_words++;
00163
00164 out_ttext << ontology.
getWord(word_id) <<
" " << word_id <<
" " << wno_wnsn <<
endl;
00165
binwrite(out_bttext, word_id);
00166
binwrite(out_bttext, wno_wnsn);
00167
00168 }
00169 input_if.close();
00170 }
00171
00172 out_ttext.close();
00173 out_bttext.close();
00174
00175 progress.
done();
00176
00177 ontology.
save(voc_file);
00178 ontology.
save(synset_file, ontology_file);
00179
00180 cout << n_unknown_errors <<
" unknown errors" <<
endl;
00181 cout << n_human_ambig_errors <<
" human ambiguity errors" <<
endl;
00182 cout << n_tagged_words <<
" tagged words" <<
endl;
00183 cout << n_non_tagged_words <<
" non-tagged words" <<
endl;
00184
00185
return 0;
00186 }
00187
00188
// Returns true iff 'base' begins with the prefix 's'.
// An empty 's' is a prefix of every string.
bool startsWith(string& base, string s)
{
    if (base.size() < s.size())
        return false;
    // Compare only the first s.size() characters of base against s.
    return base.compare(0, s.size(), s) == 0;
}
00197
00198
int extractFiles(
vector<string>& files,
int n_files)
00199 {
00200 ifstream b(
"/u/jauvinc/brown/brown1.files");
00201
string line;
00202
int total_lines = 0;
00203
int fc = 0;
00204
while (!b.eof() && fc < n_files)
00205 {
00206 getline(b, line,
'\n');
00207
if (line ==
"")
continue;
00208
string file =
"/u/jauvinc/wordnet-1.6/semcor/brown1/tagfiles/" + line;
00209 total_lines += ShellProgressBar::getWcAsciiFileLineCount(file);
00210 files.push_back(file);
00211 fc++;
00212 }
00213 b.close();
00214
00215 b.open(
"/u/jauvinc/brown/brown2.files");
00216
while (!b.eof() && fc < n_files)
00217 {
00218 getline(b, line,
'\n');
00219
if (line ==
"")
continue;
00220
string file =
"/u/jauvinc/wordnet-1.6/semcor/brown2/tagfiles/" + line;
00221 total_lines += ShellProgressBar::getWcAsciiFileLineCount(file);
00222 files.push_back(file);
00223 fc++;
00224 }
00225
00226 cout <<
"retrieved " << fc <<
" files" <<
endl;
00227
00228
return total_lines;
00229 }
00230
00231
int convertPOS2WNO(
string pos)
00232 {
00233
00234
if (pos ==
"NN" || pos ==
"NNP" || pos ==
"NNS")
return NOUN_TYPE;
00235
else if (pos ==
"VB" || pos ==
"VBD" || pos ==
"VBN")
return VERB_TYPE;
00236
else if (pos ==
"JJ")
return ADJ_TYPE;
00237
else if (pos ==
"RB")
return ADV_TYPE;
00238
else return UNDEFINED_TYPE;
00239 }