GraphicalBiText.h
Go to the documentation of this file.
00001
#include <plearn_learners/generic/Learner.h>
00002
#include <plearn_learners/language/WordNet/WordNetOntology.h>
00003
#include <plearn/math/random.h>
00004
#include <plearn/math/TMat_maths.h>
00005
#include <time.h>
00006
#include <plearn/math/ProbSparseMatrix.h>
00007
#include <plearn/vmat/SubVMatrix.h>
00008
#include <plearn_learners/language/TextSenseSequenceVMatrix.h>
00009
#include <plearn/vmat/SelectColumnsVMatrix.h>
00010
#include <plearn/vmat/DiskVMatrix.h>
00011
#include <plearn/vmat/ConcatRowsVMatrix.h>
00012
#include <plearn_learners/language/Smoothing/SmoothedProbSparseMatrix.h>
00013
#include <plearn_learners/language/Smoothing/ProbVector.h>
00014
#include <plearn/vmat/Splitter.h>
00015
00016
00017 #define DISCOUNT_MASS 0.7
00018
00019 #define MAX_EM_IT 10
00020 #define EM_THRES 0.001
00021 #define INIT_ALPHA 0.80
00022 #define INIT_P_A 0.1
00023 #define DEF_INTERP 0.5
00024
00025
namespace PLearn {
00026
using namespace std;
00027
00028
00029 VMat
loadToVMat(
string file,
string name,
int window,
int n_examples);
00030
00031
00032 class GraphicalBiText :
public Learner {
00033
00034
00035
public :
00036
00037
00038 int window_size;
00039 int n_train_examples;
00040 int n_test_examples;
00041 int n_epoch;
00042 string source_path;
00043 string source_voc;
00044 string target_voc;
00045 string train_file;
00046 string valid_file;
00047 string key_file;
00048 string sensemap_file;
00049 int sensemap_level;
00050 string semcor_train_path;
00051 string semcor_valid_path;
00052 string semcor_valid2_path;
00053 string semcor_test_path;
00054 string senseval2_train_path;
00055
00056
00057
00058 VMat wsd_train ;
00059 VMat wsd_valid ;
00060 VMat wsd_valid2;
00061 VMat wsd_test ;
00062 VMat senseval2_train;
00063
00064 real update_threshold;
00065
00066 string output_dir;
00067
00068
private :
00069
00070
00071
00072 ProbVector pS;
00073
00074 ProbVector pMC;
00075 Vec pC;
00076
00077 Vec pTC;
00078
00079 Vec pA;
00080 Vec nA;
00081
00082
00083 ProbSparseMatrix nFS;
00084 ProbSparseMatrix pFS;
00085
00086 ProbSparseMatrix nES;
00087 ProbSparseMatrix pES;
00088
00089
00090
00091 TMat<Set> commNode;
00092
00093 map<int, Set>
sens_to_conceptAncestors;
00094
00095
00096 map<int, Set>
target_word_to_senses;
00097
00098
00099
00100 ProbVector pF;
00101
00102 ProbVector pE;
00103
00104
00105 ProbSparseMatrix nEF;
00106 ProbSparseMatrix pEF;
00107
00108
00109 ProbSparseMatrix nSE;
00110 ProbSparseMatrix pSE;
00111 ProbSparseMatrix nSEbi;
00112 ProbSparseMatrix pSEbi;
00113 Vec KL;
00114
00115 map<int,bool>
BiSelect;
00116
00117
00118 map<string,string>
sensemap;
00119 map <int,int>
nodemap;
00120 map <int,int>
node_level;
00121
00122
00123
00124 int n_fields;
00125
00126 ProbVector pEbase;
00127 ProbVector pSbase;
00128 ProbVector pSupbi;
00129 Vec nS;
00130
00131 ProbVector pH;
00132 ProbVector pHbase;
00133 ProbVector pHupbi;
00134
00135 ProbSparseMatrix nESbase;
00136 ProbSparseMatrix nESupbi;
00137 SmoothedProbSparseMatrix pESbase;
00138 SmoothedProbSparseMatrix pESupbi;
00139
00140
00141 ProbSparseMatrix nHS;
00142 SmoothedProbSparseMatrix pHS;
00143 ProbSparseMatrix nHSupbi;
00144 SmoothedProbSparseMatrix pHSupbi;
00145
00146
00147 WordNetOntology ontology;
00148 int source_wsd_voc_size;
00149 int sense_size;
00150 int ss_size;
00151
00152
00153
00154 map<int, string>
source_id_to_word;
00155 map<string, int>
source_word_to_id;
00156 int source_voc_size;
00157
00158
00159 map<int, string>
target_id_to_word;
00160 map<string, int>
target_word_to_id;
00161 map<int,real>
target_id_to_proba;
00162 int target_voc_size;
00163
00164 Set target_wsd_voc;
00165 int target_wsd_voc_size;
00166
00167
00168 Vec train_bitext_tgt;
00169 Vec train_bitext_src;
00170 Vec valid_bitext_tgt;
00171 Vec valid_bitext_src;
00172
00173
00174
00175 real alpha_bn;
00176 real alpha_joint;
00177
00178
00179 Vec sum_epEC;
00180 Vec sum_fpFC;
00181 Vec sum_cpC;
00182
00183
00184
void compute_likelihood(
Vec bitext_src,
Vec bitext_tgt,
string name,
bool update);
00185
int getDeepestCommonAncestor(
int s1,
int s2);
00186
void compute_pTC();
00187
void compute_pTC(
int word);
00188
void distribute_pS_on_ancestors(
int s,
real probaToDistribute);
00189
void compute_node_level();
00190
void compute_pMC();
00191
void check_set_pA();
00192
void printNode(
int ss,ostream &out_hie);
00193
void update_pWS(
ProbSparseMatrix& ,
int ,
string);
00194
real compute_efs_likelihood(
int e,
int f,
int se);
00195
real compute_BN_likelihood(
int e,
int f,
bool update,
real nb);
00196
void optimize_interp_parameter(
Vec tgt,
Vec src,
string name);
00197
void loadBitext(
string train_file_name,
string valid_file_name,
bool update_voc);
00198
void compute_nodemap(
int split_level);
00199
void set_nodemap(
int node,
int word);
00200
void print_sensemap();
00201
void build_();
00202
void init_WSD();
00203
void init();
00204
public:
00205
00206
GraphicalBiText();
00207
virtual ~GraphicalBiText();
00208 typedef Learner inherited;
00209
PLEARN_DECLARE_OBJECT(
GraphicalBiText);
00210
00211
00212
static void declareOptions(
OptionList& ol);
00213
void build();
00214
00215 void use(
const Vec& input,
Vec& output) {
PLERROR(
"NaiveBayes does not know 'use', only 'computeOutput'"); }
00216
void train(
VMat training_set);
00217
void test();
00218
00219
void train(
int n_epoch);
00220
void senseTagBitext(
string name);
00221
void check_consitency();
00222
void print(
string name);
00223
void printHierarchy(
string name);
00224
void update_WSD_model(
string name);
00225
void sensetag_valid_bitext(
string name);
00226
void computeKL();
00227
void loadSensemap(
string sensemap_file);
00228
void compute_train_likelihood(
string name);
00229
void compute_valid_likelihood(
string name);
00230
00231
void test_WSD(
VMat wsd_test,
string name,
TVec<string> v,
bool select,
real interp = DEF_INTERP);
00232
void setTrainingSet(
VMat training_set,
bool call_forget);
00233 };
00234 }
00235
Generated on Tue Aug 17 15:54:54 2004 for PLearn by Doxygen 1.3.7