00001 #include <plearn/base/general.h> 00002 //#include "GrowingStringTable.h" 00003 #include <plearn/base/stringutils.h> 00004 #include <plearn_learners/language/WordNet/WordNetOntology.h> 00005 00006 using namespace PLearn; 00007 00008 // usage : proper_noun_filter proper_nouns stop_words < in.text > filtered_out.text 00009 00010 set<string> extractWordSet(string file) 00011 { 00012 set<string> words; 00013 ifstream in(file.c_str()); 00014 while (!in.eof()) 00015 { 00016 string line = pgetline(in); 00017 if (line == "") continue; 00018 words.insert(line); 00019 } 00020 in.close(); 00021 return words; 00022 } 00023 00024 int main(int argc, char** argv) 00025 { 00026 if (argc != 3) 00027 PLERROR("usage : proper_noun_filter proper_nouns stop_words < in.text > filtered_out.text"); 00028 //GrowingStringTable proper_nouns(argv[1]); 00029 //GrowingStringTable stop_words(argv[2]); 00030 set<string> proper_nouns = extractWordSet(argv[1]); 00031 set<string> stop_words = extractWordSet(argv[2]); 00032 WordNetOntology wn; 00033 string word; 00034 while (true) 00035 { 00036 cin >> word; 00037 if (!cin) break; 00038 word = lowerstring(word); 00039 //unsigned int pn_id = proper_nouns.elementNumber(word.c_str()); 00040 //unsigned int sw_id = stop_words.elementNumber(word.c_str()); 00041 bool is_proper_noun = proper_nouns.find(word) != proper_nouns.end(); 00042 bool is_stop_word = stop_words.find(word) != stop_words.end(); 00043 if (is_proper_noun && !is_stop_word && !wn.isInWordNet(word)) 00044 cout << "<proper_noun>" << endl; 00045 else 00046 cout << word << endl; 00047 } 00048 }