00001
#include <plearn_learners/language/WordNet/WordNetOntology.h>
00002
00003
using namespace PLearn;
00004
00005
// Returns true iff 'base' begins with the prefix 's'.
bool startsWith(string& base, string s);

// Fills 'files' with up to 'n_files' SemCor tag-file paths (brown1 then
// brown2 lists) and returns the total line count across those files.
int extractFiles(vector<string>& files, int n_files);

// Maps a Penn-Treebank POS tag to a WordNet ontology type constant.
int convertPOS2WNO(string pos);
00008
00009 int main(
int argc,
char** argv)
00010 {
00011
if (argc != 7)
00012
PLERROR(
"usage : n_files voc synsets ontology out.ttext out.bttext (argc = %d)", argc);
00013
WordNetOntology ontology;
00014
set<string> pos_types;
00015
set<string> ignored_pos_types;
00016
string line;
00017
vector<string> files;
00018
int n_files =
toint(argv[1]);
00019
string voc_file =
tostring(argv[2]);
00020
string synset_file =
tostring(argv[3]);
00021
string ontology_file =
tostring(argv[4]);
00022
int total_lines =
extractFiles(files, n_files);
00023 ifstream input_if;
00024
ShellProgressBar progress(0, total_lines - 1,
"building", 50);
00025 progress.
draw();
00026
int n = 0;
00027 ofstream out_ttext(argv[5]);
00028 ofstream out_bttext(argv[6]);
00029
int n_unknown_errors = 0;
00030
int n_human_ambig_errors = 0;
00031
int n_tagged_words = 0;
00032
int n_non_tagged_words = 0;
00033
00034
for (
unsigned int i = 0; i < files.size(); i++)
00035 {
00036 input_if.open(files[i].
c_str());
00037
00038
00039
00040
while (!input_if.eof())
00041 {
00042 getline(input_if, line,
'\n');
00043
if (line ==
"")
continue;
00044 progress.
update(n++);
00045
vector<string> tokens =
split(line,
" <>");
00046
bool bos_found =
false;
00047
bool eos_found =
false;
00048
if (tokens[0] ==
"s") bos_found =
true;
00049
else if (tokens[0] ==
"/s") eos_found =
true;
00050
else if (tokens[0] !=
"wf")
continue;
00051
bool done =
false;
00052
string pos =
"";
00053
string lemma =
"";
00054
string wnsn_str =
"";
00055
int wnsn = -1;
00056
string lexsn =
"";
00057
string word =
"";
00058
int wno_wnsn = -1;
00059
int word_id = -1;
00060
bool tagged_word =
false;
00061
00062
if (tokens.size() > 2)
00063 {
00064
for (
unsigned int j = 0; j < tokens.size() - 2; j++)
00065 {
00066
string token = tokens[j];
00067
int ts = token.size();
00068
if (
startsWith(token,
"cmd="))
00069 done = (token.substr(4, ts) ==
"done");
00070
if (
startsWith(token,
"pos="))
00071 pos = token.substr(4, ts);
00072
if (
startsWith(token,
"lemma="))
00073 lemma = token.substr(6, ts);
00074
if (
startsWith(token,
"wnsn="))
00075 {
00076 wnsn_str = token.substr(5, ts);
00077 wnsn =
toint(token.substr(5, ts));
00078 }
00079
if (
startsWith(token,
"lexsn="))
00080 lexsn = token.substr(6, ts);
00081 }
00082 }
00083
00084
if (bos_found)
00085 word =
BOS_TAG;
00086
else if (eos_found)
00087 word =
EOS_TAG;
00088
else
00089 {
00090 word = tokens[tokens.size() - 2];
00091
if (tokens[tokens.size() - 1] !=
"/wf")
PLWARNING(
"no </wf>");
00092 }
00093
00094
string actual_word;
00095
if (lemma !=
"")
00096 actual_word = lemma;
00097
else if (word !=
"")
00098 actual_word = word;
00099
00100
else continue;
00101
00102
00103 tagged_word = (pos !=
"" && lemma !=
"" && lexsn !=
"" && done);
00104
00105 actual_word =
lowerstring(actual_word);
00106
00107
00108
00109
if (!ontology.
containsWord(actual_word))
00110 {
00111 ontology.
extractWord(actual_word,
ALL_WN_TYPE,
true,
true,
false);
00112 }
00113
00114
bool human_ambig = ((wnsn_str.find(
';', 0) != string::npos) && (lexsn.find(
';', 0) != string::npos));
00115
00116 word_id = ontology.
getWordId(actual_word);
00117
if (tagged_word && !human_ambig)
00118 {
00119
00120 wno_wnsn = ontology.
getWordSenseIdForSenseKey(lemma, lexsn);
00121
00122
if (wno_wnsn ==
WNO_ERROR)
00123 {
00124 wno_wnsn = -1;
00125 n_unknown_errors++;
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136 }
else
00137 {
00138
#ifdef CHECK
00139
Set senses = ontology.
getWordSenses(word_id);
00140
if (!senses.
contains(wno_wnsn))
00141
PLWARNING(
"weird");
00142
#endif
00143
}
00144 }
else
00145 {
00146
00147
00148 wno_wnsn = -1;
00149
if (human_ambig)
00150 n_human_ambig_errors++;
00151 }
00152
00153
if (tagged_word)
00154 {
00155 ignored_pos_types.insert(pos);
00156 }
else
00157 {
00158 pos_types.insert(pos);
00159 }
00160
00161
if (wnsn == -1) n_non_tagged_words++;
00162
else n_tagged_words++;
00163
00164 out_ttext << ontology.
getWord(word_id) <<
" " << word_id <<
" " << wno_wnsn <<
endl;
00165
binwrite(out_bttext, word_id);
00166
binwrite(out_bttext, wno_wnsn);
00167
00168 }
00169 input_if.close();
00170 }
00171
00172 out_ttext.close();
00173 out_bttext.close();
00174
00175 progress.
done();
00176
00177 ontology.
save(voc_file);
00178 ontology.
save(synset_file, ontology_file);
00179
00180 cout << n_unknown_errors <<
" unknown errors" <<
endl;
00181 cout << n_human_ambig_errors <<
" human ambiguity errors" <<
endl;
00182 cout << n_tagged_words <<
" tagged words" <<
endl;
00183 cout << n_non_tagged_words <<
" non-tagged words" <<
endl;
00184
00185
return 0;
00186 }
00187
00188
// Returns true iff 'base' begins with the prefix 's'.
// An empty 's' is a prefix of every string.
bool startsWith(string& base, string s)
{
    if (base.size() < s.size())
        return false;
    // Compare only the first s.size() characters of base against s.
    return base.compare(0, s.size(), s) == 0;
}
00197
00198
int extractFiles(
vector<string>& files,
int n_files)
00199 {
00200 ifstream b(
"/u/jauvinc/brown/brown1.files");
00201
string line;
00202
int total_lines = 0;
00203
int fc = 0;
00204
while (!b.eof() && fc < n_files)
00205 {
00206 getline(b, line,
'\n');
00207
if (line ==
"")
continue;
00208
string file =
"/u/jauvinc/wordnet-1.6/semcor/brown1/tagfiles/" + line;
00209 total_lines += ShellProgressBar::getWcAsciiFileLineCount(file);
00210 files.push_back(file);
00211 fc++;
00212 }
00213 b.close();
00214
00215 b.open(
"/u/jauvinc/brown/brown2.files");
00216
while (!b.eof() && fc < n_files)
00217 {
00218 getline(b, line,
'\n');
00219
if (line ==
"")
continue;
00220
string file =
"/u/jauvinc/wordnet-1.6/semcor/brown2/tagfiles/" + line;
00221 total_lines += ShellProgressBar::getWcAsciiFileLineCount(file);
00222 files.push_back(file);
00223 fc++;
00224 }
00225
00226 cout <<
"retrieved " << fc <<
" files" <<
endl;
00227
00228
return total_lines;
00229 }
00230
00231
int convertPOS2WNO(
string pos)
00232 {
00233
00234
if (pos ==
"NN" || pos ==
"NNP" || pos ==
"NNS")
return NOUN_TYPE;
00235
else if (pos ==
"VB" || pos ==
"VBD" || pos ==
"VBN")
return VERB_TYPE;
00236
else if (pos ==
"JJ")
return ADJ_TYPE;
00237
else if (pos ==
"RB")
return ADV_TYPE;
00238
else return UNDEFINED_TYPE;
00239 }