00001 #include <plearn/base/general.h> 00002 #include <plearn_learners/language/WordNet/WordNetOntology.h> 00003 #include <plearn/io/TypesNumeriques.h> 00004 #include <plearn/base/stringutils.h> 00005 00006 using namespace PLearn; 00007 00008 // usage : full_filter proper_nouns stop_words < in.text > filtered_out.text 00009 00010 set<string> proper_nouns; 00011 set<string> stop_words; 00012 WordNetOntology wn; 00013 00014 set<string> extractWordSet(string file) 00015 { 00016 set<string> words; 00017 ifstream in(file.c_str()); 00018 while (!in.eof()) 00019 { 00020 string line = pgetline(in); 00021 if (line == "") continue; 00022 words.insert(line); 00023 } 00024 in.close(); 00025 return words; 00026 } 00027 00028 bool isPunctuation(string& word) 00029 { 00030 for (unsigned int i = 0; i < word.size(); i++) 00031 { 00032 if (isAlpha(word[i])) 00033 return false; 00034 } 00035 return true; 00036 } 00037 00038 int main(int argc, char** argv) 00039 { 00040 if (argc != 3) 00041 PLERROR("usage : full_filter proper_nouns stop_words < in.text > filtered_out.text"); 00042 proper_nouns = extractWordSet(argv[1]); 00043 //stop_words = extractWordSet(argv[2]); 00044 string word; 00045 while (true) 00046 { 00047 cin >> word; 00048 if (!cin) break; 00049 word = lowerstring(word); 00050 /*if (stop_words.find(word) != stop_words.end()) 00051 cout << STOP_TAG << endl; 00052 else*/ 00053 if (looksNumeric(word.c_str())) 00054 cout << NUMERIC_TAG << endl; 00055 else if (isPunctuation(word)) 00056 cout << PUNCTUATION_TAG << endl; 00057 else if ((proper_nouns.find(word) != proper_nouns.end()) && !wn.isInWordNet(word)) 00058 cout << PROPER_NOUN_TAG << endl; 00059 else 00060 cout << word << endl; 00061 } 00062 }