Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

VMatrix.h

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 // PLearn (A C++ Machine Learning Library) 00003 // Copyright (C) 1998 Pascal Vincent 00004 // Copyright (C) 1999-2001 Pascal Vincent, Yoshua Bengio, Rejean Ducharme and University of Montreal 00005 // Copyright (C) 2002 Pascal Vincent, Julien Keable, Xavier Saint-Mleux 00006 // 00007 // Redistribution and use in source and binary forms, with or without 00008 // modification, are permitted provided that the following conditions are met: 00009 // 00010 // 1. Redistributions of source code must retain the above copyright 00011 // notice, this list of conditions and the following disclaimer. 00012 // 00013 // 2. Redistributions in binary form must reproduce the above copyright 00014 // notice, this list of conditions and the following disclaimer in the 00015 // documentation and/or other materials provided with the distribution. 00016 // 00017 // 3. The name of the authors may not be used to endorse or promote 00018 // products derived from this software without specific prior written 00019 // permission. 00020 // 00021 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00022 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00023 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00024 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00025 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00026 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00027 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00028 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00029 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00030 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00031 // 00032 // This file is part of the PLearn library. For more information on the PLearn 00033 // library, go to the PLearn Web site at www.plearn.org 00034 00035 00036 /* ******************************************************* 00037 * $Id: VMatrix.h,v 1.52 2004/07/21 20:33:22 tihocan Exp $ 00038 ******************************************************* */ 00039 00040 00043 #ifndef VMatrix_INC 00044 #define VMatrix_INC 00045 00046 #include <cstdlib> 00047 #include <map> 00048 #include <plearn/base/PP.h> 00049 #include <plearn/math/TMat.h> 00050 #include <plearn/var/VarArray.h> 00051 #include <plearn/io/IntVecFile.h> 00052 #include <plearn/math/StatsCollector.h> 00053 #include <plearn/math/TMat_maths_impl.h> 00054 #include "VMField.h" 00055 00056 namespace PLearn { 00057 using namespace std; 00058 00059 class Ker; 00060 class VMat; 00061 class Func; 00062 00065 class VMatrix: public Object 00066 { 00067 00068 private: 00069 00070 typedef Object inherited; 00071 friend class VMat; 00072 00073 mutable FILE* lockf_; 00074 00076 mutable Vec get_row; 00077 00079 mutable Vec dotrow_1; 00080 mutable Vec dotrow_2; 00081 00082 protected: 00083 00084 int length_; 00085 int width_; 00086 time_t mtime_; // time of "last modification" of files containing the data 00087 00088 // For training/testing data sets we assume each row is composed of 3 parts 00089 // An input part, a target part, and a weight part 00090 // These fields give those parts' lengths 00091 // They are used by method VMat:: 00092 00093 int inputsize_; 00094 int targetsize_; 00095 int weightsize_; 00096 00097 // are write operations tolerated? 00098 bool writable; 00099 00102 string metadatadir; 00103 00104 // [DEPRECATED] contains a short name that can be used as part of a filename for results associated with this dataset. 00105 string alias_; 00106 00107 // New set of statistics fields: 00108 mutable TVec<StatsCollector> field_stats; 00109 00110 // the string mapping for each fields, in both directions 00111 mutable TVec<map<string,real> > map_sr; 00112 mutable TVec<map<real,string> > map_rs; 00113 00114 private: 00115 00117 void build_(); 00118 00119 public: 00120 00121 mutable Array<VMField> fieldinfos; // don't use this directly (deprecated...) call getFieldInfos() instead 00122 Array<VMFieldStat> fieldstats; 00123 00124 VMatrix(); 00125 00126 VMatrix(int the_length, int the_width); 00127 00128 // simply calls inherited::build() then build_() 00129 virtual void build(); 00130 00131 static void declareOptions(OptionList & ol); 00132 00133 void init_map_sr() const { if (map_sr.length()==0) { map_sr.resize(width()); map_rs.resize(width()); } } 00134 00135 // Data-set info 00136 // Sample parts sizes 00137 00138 00139 inline void defineSizes(int inputsize, int targetsize, int weightsize=0) 00140 { inputsize_ = inputsize, targetsize_ = targetsize, weightsize_ = weightsize; } 00141 00144 void copySizesFrom(VMat m); 00145 00147 bool looksTheSameAs(VMat m); 00148 00149 inline int inputsize() const { return inputsize_; } 00150 inline int targetsize() const { return targetsize_; } 00151 inline int weightsize() const { return weightsize_; } 00152 inline bool hasWeights() const { return weightsize_>0; } 00153 00157 virtual void getExample(int i, Vec& input, Vec& target, real& weight); 00158 00170 #define SPECIAL_FORMAT ((real)3.1e36) 00171 00172 // Field types... 00173 void setFieldInfos(const Array<VMField>& finfo); 00175 bool hasFieldInfos() const; 00177 Array<VMField>& getFieldInfos() const; 00178 VMField& getFieldInfos(int fieldindex) const { return getFieldInfos()[fieldindex]; } 00179 void declareField(int fieldindex, const string& fieldname, VMField::FieldType fieldtype=VMField::UnknownType); 00180 void declareFieldNames(TVec<string> fnames); 00181 00184 int fieldIndex(const string& fieldname) const; 00185 00187 //|< If this fails, the given string is assumed to hold the numerical index, and its 00189 int getFieldIndex(const string& fieldname_or_num) const; 00190 00191 string fieldName(int fieldindex) const { return getFieldInfos(fieldindex).name; } 00192 TVec<string> fieldNames() const; // Returns the vector of field names 00193 void unduplicateFieldNames(); // add a numeric suffix to duplic. fieldNames (eg: field.1 field.2 etc..) 00194 00195 VMField::FieldType fieldType(int fieldindex) const { return getFieldInfos(fieldindex).fieldtype; } 00196 VMField::FieldType fieldType(const string& fieldname) const { return fieldType(fieldIndex(fieldname)); } 00197 const VMFieldStat& fieldStat(int j) const { return fieldstats[j]; } 00198 const VMFieldStat& fieldStat(const string& fieldname) const { return fieldStat(fieldIndex(fieldname)); } 00199 00200 void printFields(ostream& out) const; 00201 void printFieldInfo(ostream& out, int fieldnum) const; 00202 void printFieldInfo(ostream& out, const string& fieldname_or_num) const; 00203 00204 string fieldheader(int elementcharwidth=8); 00205 00206 // loads/saves from/to the metadatadir/fieldnames file 00207 void saveFieldInfos() const; 00208 void loadFieldInfos() const; 00209 00210 // these 3 functions deal with stringmaps, notes, and binning files (all three called special field info files, or 'SFIF') 00211 // for each field eventually, I (julien) guess all this info should be wrapped (thus saved, and loaded) in the VMField class 00212 00213 // SFIFs, are by default located in the directory MyDataset.{amat,vmat,etc}.metadata/FieldInfo/ and are named 'fieldname'.{smap,notes,binning,...}. 00214 // In all 3 functions, the parameter ext (given **with** the dot) specifies the extension of the special field info file [smap,notes,binning], and col 00215 // is the column index you refer to. 00216 00217 // setSFIFFilename : sets the SFIF with extensions 'ext' to some 'string'. if this string is different 00218 // from the default filename, the string is actually placed in a new file called [dataset].metadata/FieldInfo/fieldname.[ext].lnk 00219 // if the 'string' is empty, the default SFIF filename is assumed, which is : [MyDataset].metadata/FieldInfo/fieldname.[ext] 00220 void setSFIFFilename(int col, string ext, string filepath=""); 00221 void setSFIFFilename(string fieldname, string ext, string filepath=""); 00222 00223 // getSFIFFilename :If a '*.vmat' dataset uses fields from another dataset, how can we keep the field info dependency? To resolve 00224 // this issue, a file named __default.lnk containing path 'P' can be placed in the FieldInfo directory of the .vmat. Here's how 00225 // the function getSFIFFilename search for a file : if the default SFIF file doesn't exist, it will then search for the default filename +'.lnk'. 00226 // if the later neither exists, the __default.lnk file is used if present, and if not, then an empty (thus inexistent) file (with 00227 // SFIF default filename) is assumed. 00228 string getSFIFFilename(int col, string ext); 00229 string getSFIFFilename(string fieldname, string ext); 00230 00231 // isSFIFDirect : tells whether the SFIF filename is the default filename. (if false, means the field uses the SFIF from another dataset) 00232 bool isSFIFDirect(int col, string ext); 00233 bool isSFIFDirect(string fieldname, string ext); 00234 00235 // string mapping stuff 00237 00238 // save all string mapings (one .smap file for each field) 00239 void saveAllStringMappings(); 00240 00241 // save a single field's string mapping in file 'fname' 00242 void saveStringMappings(int col,string fname); 00243 00245 void addStringMapping(int col, string str, real val); 00246 00249 real addStringMapping(int col, string str); 00250 00252 void removeAllStringMappings(); 00253 00255 void removeColumnStringMappings(int c); 00256 00258 void removeStringMapping(int col, string str); 00259 00261 void setStringMapping(int col, const map<string,real>& zemap); 00262 00264 void deleteStringMapping(int col); 00265 00267 void loadStringMapping(int col); 00268 00270 void loadAllStringMappings(); 00271 00273 void copyStringMappingsFrom(VMat source); 00274 00277 virtual string getValString(int col, real val) const; 00278 00280 virtual const map<string,real>& getStringToRealMapping(int col) const; 00281 00283 virtual const map<real,string>& getRealToStringMapping(int col) const; 00284 00286 virtual real getStringVal(int col, const string & str) const; 00287 00289 virtual string getString(int row, int col) const; 00290 00292 00293 virtual void computeStats(); 00294 bool hasStats() const { return fieldstats.size()>0; } 00295 void saveStats(const string& filename) const; 00296 void loadStats(const string& filename); 00297 00301 virtual void setMetaDataDir(const string& the_metadatadir); 00302 00304 bool hasMetaDataDir() const { return metadatadir!=""; } 00305 00307 string getMetaDataDir() const; 00308 00315 void lockMetaDataDir() const; 00316 00319 void unlockMetaDataDir() const; 00320 00323 string getAlias() const { return alias_; } 00324 void setAlias(const string& the_alias) { alias_ = the_alias; } 00325 00328 TVec<StatsCollector> getStats() const; 00329 00330 StatsCollector& getStats(int fieldnum) const 00331 { return getStats()[fieldnum]; } 00332 00335 TVec<RealMapping> getRanges(); 00336 00340 // PP<ConditionalStatsCollector> getConditionalStats(int condfield); 00341 00342 // default version calls savePMAT 00343 virtual void save(const string& filename) const; 00344 00345 virtual void savePMAT(const string& pmatfile) const; 00346 virtual void saveDMAT(const string& dmatdir) const; 00350 virtual void saveAMAT(const string& amatfile, bool verbose=true, bool no_header = false) const; 00351 00352 inline int width() const 00353 { 00354 #ifdef BOUNDCHECK 00355 if (!this) 00356 PLERROR("VMATRIX::width() This object has pointer this=NULL"); 00357 #endif 00358 return width_; 00359 } 00360 inline int length() const 00361 { 00362 #ifdef BOUNDCHECK 00363 if (!this) 00364 PLERROR("VMATRIX::length() This object has pointer this=NULL"); 00365 #endif 00366 return length_; 00367 } 00368 00369 inline bool isWritable() const { return writable; } 00370 00371 // this function (used with .vmat datasets), is used to return the filename of fieldInfo files (string maps (.smap) and notes (.notes)) 00372 // It recursively navigates through links until it finds a suitable file (.smap or .notes) 00373 // Idea : a .metadata/FieldInfo can contain one of these files : 00374 // (the order show here is the one used by the function to searches the file) 00375 00376 // fieldName.smap.lnk : containing the actual path+target OR another .lnk file 00377 // fieldName.smap : the target (the actual string map or comment file) 00378 // __default.lnk : contains another FieldInfo directory to look for target (typically the 00379 00380 // ** Note 1: that target is assumed to be an inexistant file in the directory where none of the previous 3 can be found (since the file exists only when non-empty) 00381 // ** Note 2: source may not be target 00382 string resolveFieldInfoLink(string target, string source); 00383 00389 inline time_t getMtime() const { return mtime_; } 00390 00394 inline void setMtime(time_t t) { mtime_ = t; } 00395 00397 virtual real get(int i, int j) const; 00398 00400 virtual void put(int i, int j, real value); 00401 00404 virtual void getSubRow(int i, int j, Vec v) const; 00405 00410 virtual void putSubRow(int i, int j, Vec v); 00411 00413 virtual void appendRow(Vec v); 00414 00416 virtual void flush(); 00417 00419 void putOrAppendRow(int i, Vec v); 00420 00424 void forcePutRow(int i, Vec v); 00425 00428 virtual void getRow(int i, Vec v) const; 00429 00430 virtual void putRow(int i, Vec v); 00431 virtual void fill(real value); 00432 virtual void getMat(int i, int j, Mat m) const; 00433 virtual void putMat(int i, int j, Mat m); 00434 00436 virtual void getColumn(int i, Vec v) const; 00437 00443 virtual bool find(const Vec& input, real tolerance, int* i = 0) const; 00444 00453 virtual Mat toMat() const; 00454 00457 virtual void compacify(); 00458 00460 virtual void reset_dimensions() {} 00461 00466 virtual VMat subMat(int i, int j, int l, int w); 00467 00474 virtual real dot(int i1, int i2, int inputsize) const; 00475 00476 inline real dot(int i1, int i2) const { return dot(i1,i2,width()); } 00477 00479 virtual real dot(int i, const Vec& v) const; 00480 00481 operator Mat() const { return toMat(); } 00482 00485 virtual void getRow(int i, VarArray& inputs) const; 00486 00487 void print(ostream& out) const; 00488 virtual void oldwrite(ostream& out) const; 00489 virtual void oldread(istream& in); 00490 00491 PLEARN_DECLARE_ABSTRACT_OBJECT(VMatrix); 00492 void makeDeepCopyFromShallowCopy(map<const void*, void*>& copies); 00493 00494 00505 virtual void evaluateKernel(Ker ker, int v1_startcol, int v1_ncols, 00506 const Vec& v2, const Vec& result, int startrow=0, int nrows=-1) const; 00507 00509 virtual real evaluateKernelSum(Ker ker, int v1_startcol, int v1_ncols, 00510 const Vec& v2, int startrow=0, int nrows=-1, int ignore_this_row=-1) const; 00511 00514 virtual real evaluateKernelWeightedTargetSum(Ker ker, int v1_startcol, int v1_ncols, const Vec& v2, 00515 int t_startcol, int t_ncols, Vec& targetsum, int startrow=0, int nrows=-1, int ignore_this_row=-1) const; 00516 00517 00522 virtual TVec< pair<real,int> > evaluateKernelTopN(int N, Ker ker, int v1_startcol, int v1_ncols, 00523 const Vec& v2, int startrow=0, int nrows=-1, int ignore_this_row=-1) const; 00524 00527 virtual TVec< pair<real,int> > evaluateKernelBottomN(int N, Ker ker, int v1_startcol, int v1_ncols, 00528 const Vec& v2, int startrow=0, int nrows=-1, int ignore_this_row=-1) const; 00529 00530 00535 virtual void accumulateXtY(int X_startcol, int X_ncols, int Y_startcol, int Y_ncols, 00536 Mat& result, int startrow=0, int nrows=-1, int ignore_this_row=-1) const; 00537 00538 00543 virtual void accumulateXtX(int X_startcol, int X_ncols, 00544 Mat& result, int startrow=0, int nrows=-1, int ignore_this_row=-1) const; 00545 00547 virtual void evaluateSumOfFprop(Func f, Vec& output_result, int nsamples=-1); 00548 virtual void evaluateSumOfFbprop(Func f, Vec& output_result, Vec& output_gradient, int nsamples=-1); 00549 00550 virtual ~VMatrix(); 00551 }; 00552 00553 DECLARE_OBJECT_PTR(VMatrix); 00554 00555 } // end of namespace PLearn 00556 00557 #endif 00558

Generated on Tue Aug 17 16:10:41 2004 for PLearn by doxygen 1.3.7