#include <plearn_learners/language/WordNet/WordNetOntology.h>
#include <plearn/io/TypesNumeriques.h>

// Standard headers used directly below; they are likely also pulled in
// transitively by the PLearn headers above.
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <vector>

using namespace PLearn;
using namespace std;
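
// Builds a sense-tagged corpus from the SemCor brown1/brown2 tag files plus a
// plain-text corpus, using a WordNetOntology to assign word and sense ids.
// For each word it writes a text record "word word_id sense_id" to out.ttext
// and the corresponding binary record to out.bttext, then saves the
// vocabulary, synset and ontology files named on the command line.
//
// Illustrative invocation (the program and file names are placeholders):
//   tag_corpus 10 voc.txt synsets.txt ontology.txt in.text out.ttext out.bttext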

bool startsWith(string& base, string s);
int extractFiles(vector<string>& files, int n_files);
int convertPOS2WNO(string pos);

int main(int argc, char** argv)
{
    if (argc != 8)
        PLERROR("usage: n_files voc synsets ontology in.text out.ttext out.bttext (argc = %d)", argc);

    WordNetOntology ontology;
    set<string> pos_types;
    set<string> ignored_pos_types;
    string line;
    vector<string> files;
    int n_files = toint(argv[1]);
    string voc_file = tostring(argv[2]);
    string synset_file = tostring(argv[3]);
    string ontology_file = tostring(argv[4]);
    int total_lines = extractFiles(files, n_files);
    ifstream input_if;
    ShellProgressBar progress(0, total_lines - 1, "building tagged corpus", 50);
    progress.draw();
    int n = 0;
    string in_text_file = tostring(argv[5]);
    ifstream in_text(argv[5]);
    ofstream out_ttext(argv[6]);
    ofstream out_bttext(argv[7]);
    int n_unknown_errors = 0;
    int n_human_ambig_errors = 0;
    int n_tagged_words = 0;
    int n_non_tagged_words = 0;
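
    // Pre-register the special tags so they always have valid word ids.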
    ontology.extractWord(OOV_TAG, ALL_WN_TYPE, false, false, false);
    ontology.extractWord(NUMERIC_TAG, ALL_WN_TYPE, false, false, false);
    ontology.extractWord(PROPER_NOUN_TAG, ALL_WN_TYPE, false, false, false);
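
    // First pass: scan the SemCor tag files and emit one record per word
    // form, with sense_id = -1 whenever no reliable sense tag is available.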
    for (unsigned int i = 0; i < files.size(); i++)
    {
        input_if.open(files[i].c_str());
        while (!input_if.eof())
        {
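            // A SemCor data line looks roughly like this (illustrative):
            //   <wf cmd=done pos=NN lemma=group wnsn=1 lexsn=1:03:00::>group</wf>
            // with <s> and </s> marking sentence boundaries. Splitting on
            // " <>" yields the attribute tokens, then the word, then "/wf".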
            getline(input_if, line, '\n');
            if (line == "") continue;
            progress.update(n++);
            vector<string> tokens = split(line, " <>");
            bool bos_found = false;
            bool eos_found = false;
            if (tokens[0] == "s") bos_found = true;
            else if (tokens[0] == "/s") eos_found = true;
            else if (tokens[0] != "wf") continue;
            bool done = false;
            string pos = "";
            string lemma = "";
            string wnsn_str = "";
            int wnsn = -1;
            string lexsn = "";
            string word = "";
            int wno_wnsn = -1;
            int word_id = -1;
            bool tagged_word = false;
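
            // Parse the attribute tokens that precede the word itself.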
            if (tokens.size() > 2)
            {
                for (unsigned int j = 0; j < tokens.size() - 2; j++)
                {
                    string token = tokens[j];
                    int ts = token.size();
                    if (startsWith(token, "cmd="))
                        done = (token.substr(4, ts) == "done");
                    if (startsWith(token, "pos="))
                        pos = token.substr(4, ts);
                    if (startsWith(token, "lemma="))
                        lemma = token.substr(6, ts);
                    if (startsWith(token, "wnsn="))
                    {
                        wnsn_str = token.substr(5, ts);
                        wnsn = toint(token.substr(5, ts));
                    }
                    if (startsWith(token, "lexsn="))
                        lexsn = token.substr(6, ts);
                }
            }
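
            // Determine the surface form: sentence boundaries map to the
            // BOS/EOS tags; otherwise the word is the next-to-last token and
            // the last token should be the closing "/wf".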
            if (bos_found)
                word = BOS_TAG;
            else if (eos_found)
                word = EOS_TAG;
            else
            {
                word = tokens[tokens.size() - 2];
                if (tokens[tokens.size() - 1] != "/wf")
                    PLWARNING("no </wf>");
            }

            string actual_word;
            if (lemma != "")
                actual_word = lemma;
            else if (word != "")
                actual_word = word;
            else continue;
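
            // A word counts as sense-tagged only if the annotation is marked
            // done and carries a POS, a lemma and a sense key; entries whose
            // wnsn and lexsn both contain ';' were ambiguous for the human
            // annotators and are not used.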
            tagged_word = (pos != "" && lemma != "" && lexsn != "" && done);
            bool human_ambig = ((wnsn_str.find(';', 0) != string::npos)
                                && (lexsn.find(';', 0) != string::npos));
            actual_word = lowerstring(actual_word);

            if (!ontology.containsWord(actual_word))
                ontology.extractWord(actual_word, ALL_WN_TYPE, true, true, false);
            word_id = ontology.getWordId(actual_word);
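
            // Map the WordNet sense key (lemma + lexsn) to an ontology sense
            // id; failed lookups are counted and fall back to -1.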
            if (tagged_word && !human_ambig)
            {
                wno_wnsn = ontology.getWordSenseIdForSenseKey(lemma, lexsn);
                if (wno_wnsn == WNO_ERROR)
                {
                    wno_wnsn = -1;
                    n_unknown_errors++;
                }
                else
                {
#ifdef CHECK
                    Set senses = ontology.getWordSenses(word_id);
                    if (!senses.contains(wno_wnsn))
                        PLWARNING("sense id %d not among the senses of word %d", wno_wnsn, word_id);
#endif
                }
            }
            else
            {
                wno_wnsn = -1;
                if (human_ambig)
                    n_human_ambig_errors++;
            }

            if (tagged_word)
                ignored_pos_types.insert(pos);
            else
                pos_types.insert(pos);

            if (wnsn == -1) n_non_tagged_words++;
            else n_tagged_words++;

            out_ttext << ontology.getWord(word_id) << " " << word_id << " " << wno_wnsn << endl;
            binwrite(out_bttext, word_id);
            binwrite(out_bttext, wno_wnsn);
        }
        input_if.close();
    }
    progress.done();

    cout << ontology.getAllWords().size() << " words in vocabulary" << endl;
    cout << "vocabulary is locked" << endl;
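
    // Second pass: the plain-text corpus, one word per line. Words are
    // stemmed and mapped to existing ontology entries; numeric-looking
    // tokens map to NUMERIC_TAG and everything else to OOV_TAG, always
    // with sense id -1.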
    int n_non_tagged_lines = ShellProgressBar::getWcAsciiFileLineCount(in_text_file);
    progress.set(0, n_non_tagged_lines - 1, "building non-tagged corpus", 50);
    progress.reset();
    progress.init();
    progress.draw();
    n = 0;
    string word;
    while (!in_text.eof())
    {
        getline(in_text, word, '\n');
        if (word == "") continue;
        progress.update(n++);
        n_non_tagged_words++;
        string stemmed_word = stemWord(word);
        int word_id;
        if (ontology.containsWord(stemmed_word))
        {
            word_id = ontology.getWordId(stemmed_word);
        }
        else if (looksNumeric(word.c_str()))
        {
            word_id = ontology.getWordId(NUMERIC_TAG);
        }
        else
        {
            word_id = ontology.getWordId(OOV_TAG);
        }

        out_ttext << ontology.getWord(word_id) << " " << word_id << " " << -1 << endl;
        binwrite(out_bttext, word_id);
        binwrite(out_bttext, -1);
    }
    progress.done();

    in_text.close();
    out_ttext.close();
    out_bttext.close();

    cout << n_unknown_errors << " unknown errors" << endl;
    cout << n_human_ambig_errors << " human ambiguity errors" << endl;
    cout << n_tagged_words << " tagged words" << endl;
    cout << n_non_tagged_words << " non-tagged words" << endl;

    cout << "saving " << voc_file << ", " << synset_file << ", " << ontology_file << "..." << endl;
    ontology.save(voc_file);
    ontology.save(synset_file, ontology_file);

    return 0;
}
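
// Returns true iff 'base' begins with the prefix 's'.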
bool startsWith(string& base, string s)
{
    if (base.size() < s.size())
        return false;
    for (unsigned int i = 0; i < s.size(); i++)
    {
        if (base[i] != s[i])
            return false;
    }
    return true;
}
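
// Reads SemCor tag-file names from brown1.files and brown2.files (up to
// n_files in total), appends their full paths to 'files', and returns the
// total line count over all retrieved files.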
int extractFiles(vector<string>& files, int n_files)
{
    ifstream b;
    string line;
    int total_lines = 0;
    int fc = 0;

    b.open("brown1.files");
    while (!b.eof() && fc < n_files)
    {
        getline(b, line, '\n');
        if (line == "" || startsWith(line, "#")) continue;
        string file = "/u/jauvinc/wordnet-1.6/semcor/brown1/tagfiles/" + line;
        total_lines += ShellProgressBar::getWcAsciiFileLineCount(file);
        files.push_back(file);
        fc++;
    }
    b.close();
    b.clear(); // clear the eof/fail flags so the stream can be reused below

    b.open("brown2.files");
    while (!b.eof() && fc < n_files)
    {
        getline(b, line, '\n');
        if (line == "" || startsWith(line, "#")) continue; // skip blanks/comments, as above
        string file = "/u/jauvinc/wordnet-1.6/semcor/brown2/tagfiles/" + line;
        total_lines += ShellProgressBar::getWcAsciiFileLineCount(file);
        files.push_back(file);
        fc++;
    }
    b.close();

    cout << "retrieved " << fc << " files" << endl;

    return total_lines;
}
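
// Maps a Penn-Treebank-style POS tag to the corresponding WordNet type.
// (Declared above but not called anywhere in this file.)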
int convertPOS2WNO(string pos)
{
    if (pos == "NN" || pos == "NNP" || pos == "NNS")
        return NOUN_TYPE;
    else if (pos == "VB" || pos == "VBD" || pos == "VBN")
        return VERB_TYPE;
    else if (pos == "JJ")
        return ADJ_TYPE;
    else if (pos == "RB")
        return ADV_TYPE;
    else
        return UNDEFINED_TYPE;
}