00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00043
#include <map>
00044
#include <algorithm>
00045
#include <iterator>
00046
#include "MultiInstanceVMatrix.h"
00047
#include <plearn/var/SumOverBagsVariable.h>
00048
#include <plearn/base/stringutils.h>
00049
#include <plearn/io/fileutils.h>
00050
00051
namespace PLearn {
00052
using namespace std;
00053
00054 MultiInstanceVMatrix::MultiInstanceVMatrix()
00055 :
inherited(), data_(
Mat()), source_targetsize(1),
00056 header_lines_to_skip(0)
00057 {
00058
00059
00060 }
00061
00062
00063
00064
00065
00066
00067
00068
00069
// Class declaration for the PLearn object machinery: class name, one-line
// description, then the multi-line help text.
// NOTE(review): the macro is defined elsewhere in PLearn; the argument roles
// above are read off the call shape -- confirm against its definition.
PLEARN_IMPLEMENT_OBJECT(
    MultiInstanceVMatrix,
    "Virtual Matrix for a multi instance dataset",
    "In a multi-instance dataset examples come in 'bags' with only one target label\n"
    "for each bag. This class is built upon a source text file that describes such\n"
    "a dataset (see the help on the 'filename' option for format details).\n"
    "The resulting VMatrix shows the following structure in its rows, with\n"
    "all the rows of a bag being consecutive. Each row represents an instance and has:\n"
    " - the input features for the instance\n"
    " - the bag's source_targetsize target values (repeated over bag instances)\n"
    " - a bag signal integer that identifies the beginning and end of the bag:\n"
    " 1 means the first instance of the bag\n"
    " 2 means the last instance of the bag\n"
    " 3 is for a bag with a single row (= 1+2)\n"
    " 0 is for intermediate instances.\n"
    "The targetsize of the VMatrix is automatically set to source_targetsize+1\n"
    "since the bag_signal is included (appended) in the target vector\n"
    );
00085
00086 void MultiInstanceVMatrix::getNewRow(
int i,
const Vec& v)
const
00087
{
00088 v <<
data_(i);
00089 }
00090
00091 void MultiInstanceVMatrix::declareOptions(
OptionList& ol)
00092 {
00093
declareOption(ol,
"source_targetsize", &MultiInstanceVMatrix::source_targetsize, OptionBase::buildoption,
00094
"The source targetsize");
00095
00096
declareOption(ol,
"filename", &MultiInstanceVMatrix::filename_, OptionBase::buildoption,
00097
"This is the name of the ascii 'mimat' format filename. It is a supervised learning dataset\n"
00098
"in which each input object can come in several instances (e.g. conformations) and the target is given to the\n"
00099
"whole bag of these instances, not to individual instances. The expected format is the following:\n"
00100
"Each row contains:\n"
00101
" - the object name (a string without white space)\n"
00102
" - the instance number (a non-negative integer)\n"
00103
" - the inputsize features for that instance (numeric, white-separated)\n"
00104
" - the source_targetsize target values for the bag (repeated on each row).\n"
00105
"If the inputsize option is not specified it is inferred from the text file.\n"
00106 );
00107
00108
declareOption(ol,
"header_lines_to_skip", &MultiInstanceVMatrix::header_lines_to_skip, OptionBase::buildoption,
00109
"The number of lines to skip at the beginning of the file (they may be garbage, or \n"
00110
"a header for a TextFilesVMatrix for instance).");
00111
00112
00113 inherited::declareOptions(ol);
00114 }
00115
00116 void MultiInstanceVMatrix::build_()
00117 {
00118
00119
00120
00121
00122
00123 ifstream inFile(
filename_.c_str());
00124
if(!inFile)
00125
PLERROR(
"In MultiInstanceVMatrix could not open file %s for reading",
filename_.c_str());
00126
00127 inFile.seekg(0);
00128
skipBlanksAndComments(inFile);
00129
00130
string lastName =
"";
00131
string newName;
00132
string aLine;
00133
string inp_element;
00134
int configNum, bagType;
00135
int nComp = 0;
00136
00137
int i;
00138
00139
real* mat_i = NULL;
00140
00141
00142 targetsize_ =
source_targetsize + 1;
00143
00144
00145
for (i = 0; i <
header_lines_to_skip; i++) {
00146 getline(inFile, aLine,
'\n');
00147 }
00148 getline(inFile, aLine,
'\n');
00149
vector<string> entries =
split(aLine);
00150
int nFields = (
int)entries.size();
00151
if (inputsize_>=0)
00152 {
00153
if ( (nFields-2) != inputsize_ + source_targetsize)
00154 {
00155
PLERROR(
"Either inputsize or source_targetsize are inconsistent with the specified file!\n"
00156
" Got %d+%d (inputsize+source_targetsize) = %d, and found %d! If unsure about inputsize, don't specify it or set to -1.",
00157 inputsize_, source_targetsize, inputsize_+source_targetsize, nFields - 2);
00158 }
00159 }
else inputsize_ = nFields-2-source_targetsize;
00160
00161
int lastColumn = inputsize_ + source_targetsize;
00162
00163 inFile.seekg(0);
00164
skipBlanksAndComments(inFile);
00165
for (i = 0; i < header_lines_to_skip; i++) {
00166 getline(inFile, aLine,
'\n');
00167 }
00168
skipBlanksAndComments(inFile);
00169
00170
int nRows =
count(istreambuf_iterator<char>(inFile),
00171 istreambuf_iterator<char>(),
'\n');
00172
00173 inFile.seekg(0);
00174
skipBlanksAndComments(inFile);
00175
for (i = 0; i < header_lines_to_skip; i++) {
00176 getline(inFile, aLine,
'\n');
00177 }
00178
skipBlanksAndComments(inFile);
00179
00180
data_.
resize(nRows, inputsize_ + targetsize_);
00181
00182 width_ = inputsize_ + targetsize_;
00183 length_ = nRows;
00184
00185
for (
int lineNum = 0; !inFile.eof() && lineNum < nRows; ++lineNum)
00186 {
00187
00188 inFile >> newName;
00189
if (newName != lastName)
00190 {
00191 lastName = newName;
00192
names_.push_back( make_pair(newName, lineNum) );
00193 bagType = SumOverBagsVariable::TARGET_COLUMN_FIRST;
00194
00195
if (mat_i != NULL)
00196 {
00197
if (nComp > 1)
00198 mat_i[lastColumn] = SumOverBagsVariable::TARGET_COLUMN_LAST;
00199
else
00200 mat_i[lastColumn] = SumOverBagsVariable::TARGET_COLUMN_SINGLE;
00201 }
00202 nComp = 0;
00203 }
00204
else
00205 {
00206 bagType = SumOverBagsVariable::TARGET_COLUMN_INTERMEDIATE;
00207 }
00208 nComp++;
00209
00210
00211 inFile >> configNum;
00212
00213
configs_.push_back(configNum);
00214
00215
00216 mat_i =
data_[lineNum];
00217
for(
int i = 0; i < inputsize_ + source_targetsize; i++)
00218 {
00219 inFile >> inp_element;
00220 mat_i[i] = strtod(inp_element.c_str(), 0);
00221 }
00222
00223
00224
if (lineNum+1==nRows)
00225 {
00226
if (nComp > 1)
00227 mat_i[lastColumn] = SumOverBagsVariable::TARGET_COLUMN_LAST;
00228
else
00229 mat_i[lastColumn] = SumOverBagsVariable::TARGET_COLUMN_SINGLE;
00230 }
00231
else
00232 mat_i[lastColumn] = bagType;
00233 }
00234
00235
00236
00237
00238
00239
00240 this->setMtime(
mtime(
filename_));
00241 inFile.close();
00242 }
00243
00244
00245 void MultiInstanceVMatrix::build()
00246 {
00247 inherited::build();
00248
build_();
00249 }
00250
00251 void MultiInstanceVMatrix::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies)
00252 {
00253 inherited::makeDeepCopyFromShallowCopy(copies);
00254
00255
00256
00257
00258
00259
00260
deepCopyField(
data_, copies);
00261
00262
00263
00264
00265
PLERROR(
"MultiInstanceVMatrix::makeDeepCopyFromShallowCopy not fully implemented yet!");
00266 }
00267
00268 }
00269