Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

MultiInstanceVMatrix.cc

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // MultiInstanceVMatrix.cc 00004 // 00005 // Copyright (C) 2004 Norman Casagrande 00006 // 00007 // Redistribution and use in source and binary forms, with or without 00008 // modification, are permitted provided that the following conditions are met: 00009 // 00010 // 1. Redistributions of source code must retain the above copyright 00011 // notice, this list of conditions and the following disclaimer. 00012 // 00013 // 2. Redistributions in binary form must reproduce the above copyright 00014 // notice, this list of conditions and the following disclaimer in the 00015 // documentation and/or other materials provided with the distribution. 00016 // 00017 // 3. The name of the authors may not be used to endorse or promote 00018 // products derived from this software without specific prior written 00019 // permission. 00020 // 00021 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00022 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00023 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00024 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00025 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00026 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00027 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00028 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00029 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00030 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00031 // 00032 // This file is part of the PLearn library. For more information on the PLearn 00033 // library, go to the PLearn Web site at www.plearn.org 00034 00035 /* ******************************************************* 00036 * $Id: MultiInstanceVMatrix.cc,v 1.15 2004/07/21 16:30:55 chrish42 Exp $ 00037 ******************************************************* */ 00038 00039 // Authors: Norman Casagrande 00040 00043 #include <map> 00044 #include <algorithm> 00045 #include <iterator> 00046 #include "MultiInstanceVMatrix.h" 00047 #include <plearn/var/SumOverBagsVariable.h> 00048 #include <plearn/base/stringutils.h> 00049 #include <plearn/io/fileutils.h> 00050 00051 namespace PLearn { 00052 using namespace std; 00053 00054 MultiInstanceVMatrix::MultiInstanceVMatrix() 00055 :inherited(), data_(Mat()), source_targetsize(1), 00056 header_lines_to_skip(0) 00057 { 00058 // ### You may or may not want to call build_() to finish building the object 00059 //build_(); 00060 } 00061 00062 //MultiInstanceVMatrix::MultiInstanceVMatrix(const string& filename) 00063 // :inherited(), filename_(abspath(filename)) 00064 //{ 00065 // //build(); 00066 //} 00067 00068 00069 PLEARN_IMPLEMENT_OBJECT(MultiInstanceVMatrix, "Virtual Matrix for a multi instance dataset", 00070 "In a multi-instance dataset examples come in 'bags' with only one target label\n" 00071 "for each bag. This class is built upon a source text file that describes such\n" 00072 "a dataset (see the help on the 'filename' option for format details).\n" 00073 "The resulting VMatrix shows the following structure in its rows, with\n" 00074 "all the rows of a bag being consecutive. Each row represents an instance and has:\n" 00075 " - the input features for the instance\n" 00076 " - the bag's source_targetsize target values (repeated over bag instances)\n" 00077 " - a bag signal integer that identifies the beginning and end of the bag:\n" 00078 " 1 means the first instance of the bag\n" 00079 " 2 means the last instance of the bag\n" 00080 " 3 is for a bag with a single row (= 1+2)\n" 00081 " 0 is for intermediate instances.\n" 00082 "The targetsize of the VMatrix is automatically set to source_targetsize+1\n" 00083 "since the bag_signal is included (appended) in the target vector\n" 00084 ); 00085 00086 void MultiInstanceVMatrix::getNewRow(int i, const Vec& v) const 00087 { 00088 v << data_(i); 00089 } 00090 00091 void MultiInstanceVMatrix::declareOptions(OptionList& ol) 00092 { 00093 declareOption(ol, "source_targetsize", &MultiInstanceVMatrix::source_targetsize, OptionBase::buildoption, 00094 "The source targetsize"); 00095 00096 declareOption(ol, "filename", &MultiInstanceVMatrix::filename_, OptionBase::buildoption, 00097 "This is the name of the ascii 'mimat' format filename. It is a supervised learning dataset\n" 00098 "in which each input object can come in several instances (e.g. conformations) and the target is given to the\n" 00099 "whole bag of these instances, not to individual instances. The expected format is the following:\n" 00100 "Each row contains:\n" 00101 " - the object name (a string without white space)\n" 00102 " - the instance number (a non-negative integer)\n" 00103 " - the inputsize features for that instance (numeric, white-separated)\n" 00104 " - the source_targetsize target values for the bag (repeated on each row).\n" 00105 "If the inputsize option is not specified it is inferred from the text file.\n" 00106 ); 00107 00108 declareOption(ol, "header_lines_to_skip", &MultiInstanceVMatrix::header_lines_to_skip, OptionBase::buildoption, 00109 "The number of lines to skip at the beginning of the file (they may be garbage, or \n" 00110 "a header for a TextFilesVMatrix for instance)."); 00111 00112 // Now call the parent class' declareOptions 00113 inherited::declareOptions(ol); 00114 } 00115 00116 void MultiInstanceVMatrix::build_() 00117 { 00118 //this->setMetaDataDir(filename_ + ".metadata"); 00119 00120 // To be used in the end.. it is about 5 secs slower in debug 00121 //int nRows = countNonBlankLinesOfFile(filename_); 00122 00123 ifstream inFile(filename_.c_str()); 00124 if(!inFile) 00125 PLERROR("In MultiInstanceVMatrix could not open file %s for reading", filename_.c_str()); 00126 00127 inFile.seekg(0); 00128 skipBlanksAndComments(inFile); 00129 00130 string lastName = ""; 00131 string newName; 00132 string aLine; 00133 string inp_element; 00134 int configNum, bagType; 00135 int nComp = 0; 00136 00137 int i; 00138 00139 real* mat_i = NULL; 00140 00141 // one more column for the bag signal 00142 targetsize_ = source_targetsize + 1; 00143 00144 // Check the number of columns 00145 for (i = 0; i < header_lines_to_skip; i++) { 00146 getline(inFile, aLine, '\n'); 00147 } 00148 getline(inFile, aLine, '\n'); 00149 vector<string> entries = split(aLine); 00150 int nFields = (int)entries.size(); 00151 if (inputsize_>=0) 00152 { 00153 if ( (nFields-2) != inputsize_ + source_targetsize) // 2 for the object name and the instance number 00154 { 00155 PLERROR("Either inputsize or source_targetsize are inconsistent with the specified file!\n" 00156 " Got %d+%d (inputsize+source_targetsize) = %d, and found %d! If unsure about inputsize, don't specify it or set to -1.", 00157 inputsize_, source_targetsize, inputsize_+source_targetsize, nFields - 2); 00158 } 00159 } else inputsize_ = nFields-2-source_targetsize; 00160 00161 int lastColumn = inputsize_ + source_targetsize; 00162 00163 inFile.seekg(0); 00164 skipBlanksAndComments(inFile); 00165 for (i = 0; i < header_lines_to_skip; i++) { 00166 getline(inFile, aLine, '\n'); 00167 } 00168 skipBlanksAndComments(inFile); 00169 00170 int nRows = count(istreambuf_iterator<char>(inFile), 00171 istreambuf_iterator<char>(), '\n'); 00172 00173 inFile.seekg(0); 00174 skipBlanksAndComments(inFile); 00175 for (i = 0; i < header_lines_to_skip; i++) { 00176 getline(inFile, aLine, '\n'); 00177 } 00178 skipBlanksAndComments(inFile); 00179 00180 data_.resize(nRows, inputsize_ + targetsize_); 00181 00182 width_ = inputsize_ + targetsize_; 00183 length_ = nRows; 00184 00185 for (int lineNum = 0; !inFile.eof() && lineNum < nRows; ++lineNum) 00186 { 00187 // first column: name of the compound 00188 inFile >> newName; 00189 if (newName != lastName) 00190 { 00191 lastName = newName; 00192 names_.push_back( make_pair(newName, lineNum) ); 00193 bagType = SumOverBagsVariable::TARGET_COLUMN_FIRST; 00194 00195 if (mat_i != NULL) 00196 { 00197 if (nComp > 1) 00198 mat_i[lastColumn] = SumOverBagsVariable::TARGET_COLUMN_LAST; 00199 else 00200 mat_i[lastColumn] = SumOverBagsVariable::TARGET_COLUMN_SINGLE; 00201 } 00202 nComp = 0; 00203 } 00204 else 00205 { 00206 bagType = SumOverBagsVariable::TARGET_COLUMN_INTERMEDIATE; 00207 } 00208 nComp++; 00209 00210 // get next column: the number of the compound 00211 inFile >> configNum; 00212 00213 configs_.push_back(configNum); 00214 00215 // get the actual data columns + the target 00216 mat_i = data_[lineNum]; 00217 for(int i = 0; i < inputsize_ + source_targetsize; i++) 00218 { 00219 inFile >> inp_element; 00220 mat_i[i] = strtod(inp_element.c_str(), 0); 00221 } 00222 00223 // close the last bag if necessary 00224 if (lineNum+1==nRows) 00225 { 00226 if (nComp > 1) 00227 mat_i[lastColumn] = SumOverBagsVariable::TARGET_COLUMN_LAST; 00228 else 00229 mat_i[lastColumn] = SumOverBagsVariable::TARGET_COLUMN_SINGLE; 00230 } 00231 else 00232 mat_i[lastColumn] = bagType; 00233 } 00234 00235 00236 //ofstream test("g:/test.txt"); 00237 //test << data_; 00238 //test.close(); 00239 00240 this->setMtime(mtime(filename_)); 00241 inFile.close(); 00242 } 00243 00244 // ### Nothing to add here, simply calls build_ 00245 void MultiInstanceVMatrix::build() 00246 { 00247 inherited::build(); 00248 build_(); 00249 } 00250 00251 void MultiInstanceVMatrix::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies) 00252 { 00253 inherited::makeDeepCopyFromShallowCopy(copies); 00254 00255 // ### Call deepCopyField on all "pointer-like" fields 00256 // ### that you wish to be deepCopied rather than 00257 // ### shallow-copied. 00258 // ### ex: 00259 00260 deepCopyField(data_, copies); 00261 00262 // TODO: Copy also the other features 00263 00264 // ### Remove this line when you have fully implemented this method. 00265 PLERROR("MultiInstanceVMatrix::makeDeepCopyFromShallowCopy not fully implemented yet!"); 00266 } 00267 00268 } // end of namespace PLearn 00269

Generated on Tue Aug 17 15:59:20 2004 for PLearn by doxygen 1.3.7