#include <plearn_learners/language/WordNet/WordNetOntology.h>
#include <plearn/io/TypesNumeriques.h>

// Standard headers used directly below; they are likely also pulled in
// transitively by the PLearn headers above.
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <vector>

using namespace PLearn;
using namespace std;
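
// Builds a sense-tagged corpus from the SemCor brown1/brown2 tag files plus a
// plain-text corpus, using a WordNetOntology to assign word and sense ids.
// For each word it writes a text record "word word_id sense_id" to out.ttext
// and the corresponding binary record to out.bttext, then saves the
// vocabulary, synset and ontology files named on the command line.
//
// Illustrative invocation (the program and file names are placeholders):
//   tag_corpus 10 voc.txt synsets.txt ontology.txt in.text out.ttext out.bttext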

bool startsWith(string& base, string s);
int extractFiles(vector<string>& files, int n_files);
int convertPOS2WNO(string pos);

int main(int argc, char** argv)
{
    if (argc != 8)
        PLERROR("usage: n_files voc synsets ontology in.text out.ttext out.bttext (argc = %d)", argc);

    WordNetOntology ontology;
    set<string> pos_types;
    set<string> ignored_pos_types;
    string line;
    vector<string> files;
    int n_files = toint(argv[1]);
    string voc_file = tostring(argv[2]);
    string synset_file = tostring(argv[3]);
    string ontology_file = tostring(argv[4]);
    int total_lines = extractFiles(files, n_files);
    ifstream input_if;
    ShellProgressBar progress(0, total_lines - 1, "building tagged corpus", 50);
    progress.draw();
    int n = 0;
    string in_text_file = tostring(argv[5]);
    ifstream in_text(argv[5]);
    ofstream out_ttext(argv[6]);
    ofstream out_bttext(argv[7]);
    int n_unknown_errors = 0;
    int n_human_ambig_errors = 0;
    int n_tagged_words = 0;
    int n_non_tagged_words = 0;
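
    // Pre-register the special tags so they always have valid word ids.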
    ontology.extractWord(OOV_TAG, ALL_WN_TYPE, false, false, false);
    ontology.extractWord(NUMERIC_TAG, ALL_WN_TYPE, false, false, false);
    ontology.extractWord(PROPER_NOUN_TAG, ALL_WN_TYPE, false, false, false);
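
    // First pass: scan the SemCor tag files and emit one record per word
    // form, with sense_id = -1 whenever no reliable sense tag is available.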
    for (unsigned int i = 0; i < files.size(); i++)
    {
        input_if.open(files[i].c_str());
        while (!input_if.eof())
        {
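            // A SemCor data line looks roughly like this (illustrative):
            //   <wf cmd=done pos=NN lemma=group wnsn=1 lexsn=1:03:00::>group</wf>
            // with <s> and </s> marking sentence boundaries. Splitting on
            // " <>" yields the attribute tokens, then the word, then "/wf".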
            getline(input_if, line, '\n');
            if (line == "") continue;
            progress.update(n++);
            vector<string> tokens = split(line, " <>");
            bool bos_found = false;
            bool eos_found = false;
            if (tokens[0] == "s") bos_found = true;
            else if (tokens[0] == "/s") eos_found = true;
            else if (tokens[0] != "wf") continue;
            bool done = false;
            string pos = "";
            string lemma = "";
            string wnsn_str = "";
            int wnsn = -1;
            string lexsn = "";
            string word = "";
            int wno_wnsn = -1;
            int word_id = -1;
            bool tagged_word = false;
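
            // Parse the attribute tokens that precede the word itself.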
            if (tokens.size() > 2)
            {
                for (unsigned int j = 0; j < tokens.size() - 2; j++)
                {
                    string token = tokens[j];
                    int ts = token.size();
                    if (startsWith(token, "cmd="))
                        done = (token.substr(4, ts) == "done");
                    if (startsWith(token, "pos="))
                        pos = token.substr(4, ts);
                    if (startsWith(token, "lemma="))
                        lemma = token.substr(6, ts);
                    if (startsWith(token, "wnsn="))
                    {
                        wnsn_str = token.substr(5, ts);
                        wnsn = toint(token.substr(5, ts));
                    }
                    if (startsWith(token, "lexsn="))
                        lexsn = token.substr(6, ts);
                }
            }
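
            // Determine the surface form: sentence boundaries map to the
            // BOS/EOS tags; otherwise the word is the next-to-last token and
            // the last token should be the closing "/wf".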
            if (bos_found)
                word = BOS_TAG;
            else if (eos_found)
                word = EOS_TAG;
            else
            {
                word = tokens[tokens.size() - 2];
                if (tokens[tokens.size() - 1] != "/wf")
                    PLWARNING("no </wf>");
            }

            string actual_word;
            if (lemma != "")
                actual_word = lemma;
            else if (word != "")
                actual_word = word;
            else continue;
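
            // A word counts as sense-tagged only if the annotation is marked
            // done and carries a POS, a lemma and a sense key; entries whose
            // wnsn and lexsn both contain ';' were ambiguous for the human
            // annotators and are not used.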
            tagged_word = (pos != "" && lemma != "" && lexsn != "" && done);
            bool human_ambig = ((wnsn_str.find(';', 0) != string::npos)
                                && (lexsn.find(';', 0) != string::npos));
            actual_word = lowerstring(actual_word);

            if (!ontology.containsWord(actual_word))
                ontology.extractWord(actual_word, ALL_WN_TYPE, true, true, false);
            word_id = ontology.getWordId(actual_word);
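
            // Map the WordNet sense key (lemma + lexsn) to an ontology sense
            // id; failed lookups are counted and fall back to -1.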
            if (tagged_word && !human_ambig)
            {
                wno_wnsn = ontology.getWordSenseIdForSenseKey(lemma, lexsn);
                if (wno_wnsn == WNO_ERROR)
                {
                    wno_wnsn = -1;
                    n_unknown_errors++;
                }
                else
                {
#ifdef CHECK
                    Set senses = ontology.getWordSenses(word_id);
                    if (!senses.contains(wno_wnsn))
                        PLWARNING("sense id %d not among the senses of word %d", wno_wnsn, word_id);
#endif
                }
            }
            else
            {
                wno_wnsn = -1;
                if (human_ambig)
                    n_human_ambig_errors++;
            }

            if (tagged_word)
                ignored_pos_types.insert(pos);
            else
                pos_types.insert(pos);

            if (wnsn == -1) n_non_tagged_words++;
            else n_tagged_words++;

            out_ttext << ontology.getWord(word_id) << " " << word_id << " " << wno_wnsn << endl;
            binwrite(out_bttext, word_id);
            binwrite(out_bttext, wno_wnsn);
        }
        input_if.close();
    }
    progress.done();

    cout << ontology.getAllWords().size() << " words in vocabulary" << endl;
    cout << "vocabulary is locked" << endl;
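
    // Second pass: the plain-text corpus, one word per line. Words are
    // stemmed and mapped to existing ontology entries; numeric-looking
    // tokens map to NUMERIC_TAG and everything else to OOV_TAG, always
    // with sense id -1.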
    int n_non_tagged_lines = ShellProgressBar::getWcAsciiFileLineCount(in_text_file);
    progress.set(0, n_non_tagged_lines - 1, "building non-tagged corpus", 50);
    progress.reset();
    progress.init();
    progress.draw();
    n = 0;
    string word;
    while (!in_text.eof())
    {
        getline(in_text, word, '\n');
        if (word == "") continue;
        progress.update(n++);
        n_non_tagged_words++;
        string stemmed_word = stemWord(word);
        int word_id;
        if (ontology.containsWord(stemmed_word))
        {
            word_id = ontology.getWordId(stemmed_word);
        }
        else if (looksNumeric(word.c_str()))
        {
            word_id = ontology.getWordId(NUMERIC_TAG);
        }
        else
        {
            word_id = ontology.getWordId(OOV_TAG);
        }

        out_ttext << ontology.getWord(word_id) << " " << word_id << " " << -1 << endl;
        binwrite(out_bttext, word_id);
        binwrite(out_bttext, -1);
    }
    progress.done();

    in_text.close();
    out_ttext.close();
    out_bttext.close();

    cout << n_unknown_errors << " unknown errors" << endl;
    cout << n_human_ambig_errors << " human ambiguity errors" << endl;
    cout << n_tagged_words << " tagged words" << endl;
    cout << n_non_tagged_words << " non-tagged words" << endl;

    cout << "saving " << voc_file << ", " << synset_file << ", " << ontology_file << "..." << endl;
    ontology.save(voc_file);
    ontology.save(synset_file, ontology_file);

    return 0;
}
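
// Returns true iff 'base' begins with the prefix 's'.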
bool startsWith(string& base, string s)
{
    if (base.size() < s.size())
        return false;
    for (unsigned int i = 0; i < s.size(); i++)
    {
        if (base[i] != s[i])
            return false;
    }
    return true;
}
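
// Reads SemCor tag-file names from brown1.files and brown2.files (up to
// n_files in total), appends their full paths to 'files', and returns the
// total line count over all retrieved files.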
int extractFiles(vector<string>& files, int n_files)
{
    ifstream b;
    string line;
    int total_lines = 0;
    int fc = 0;

    b.open("brown1.files");
    while (!b.eof() && fc < n_files)
    {
        getline(b, line, '\n');
        if (line == "" || startsWith(line, "#")) continue;
        string file = "/u/jauvinc/wordnet-1.6/semcor/brown1/tagfiles/" + line;
        total_lines += ShellProgressBar::getWcAsciiFileLineCount(file);
        files.push_back(file);
        fc++;
    }
    b.close();
    b.clear(); // clear the eof/fail flags so the stream can be reused below

    b.open("brown2.files");
    while (!b.eof() && fc < n_files)
    {
        getline(b, line, '\n');
        if (line == "" || startsWith(line, "#")) continue; // skip blanks/comments, as above
        string file = "/u/jauvinc/wordnet-1.6/semcor/brown2/tagfiles/" + line;
        total_lines += ShellProgressBar::getWcAsciiFileLineCount(file);
        files.push_back(file);
        fc++;
    }
    b.close();

    cout << "retrieved " << fc << " files" << endl;

    return total_lines;
}
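
// Maps a Penn-Treebank-style POS tag to the corresponding WordNet type.
// (Declared above but not called anywhere in this file.)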
int convertPOS2WNO(string pos)
{
    if (pos == "NN" || pos == "NNP" || pos == "NNS")
        return NOUN_TYPE;
    else if (pos == "VB" || pos == "VBD" || pos == "VBN")
        return VERB_TYPE;
    else if (pos == "JJ")
        return ADJ_TYPE;
    else if (pos == "RB")
        return ADV_TYPE;
    else
        return UNDEFINED_TYPE;
}