00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
#include "AutoSDBVMatrix.h"
00046
00047
00048
#include <plearn/vmat/ConcatRowsVMatrix.h>
00049
#include "databases.h"
00050
#include <plearn/vmat/DiskVMatrix.h>
00051
#include <plearn/vmat/FileVMatrix.h>
00052
#include "getDataSet.h"
00053
00054
00055
#include <plearn/vmat/StrTableVMatrix.h>
00056
#include <plearn/base/StringTable.h>
00057
#include <plearn/vmat/VMat.h>
00058
#include <plearn/vmat/VVMatrix.h>
00059
00060
namespace PLearn {
00061
using namespace std;
00062
00063
00064 string getDataSetHelp()
00065 {
00066
return "Dataset specification can be one of: \n"
00067
" - the path to a matrix file (or directory) .amat .pmat .vmat .dmat or plain ascii \n"
00068
" - the basename of an .sdb \n"
00069
" - the object specification of a VMatrix subclass \n"
00070
" - the specification of a preprogrammed dataset i.e. one of the datasetnames below,\n"
00071
" followed by the word 'train' or 'test', optionally followed by the word 'normalize'\n"
00072 +
loadClassificationDatasetHelp() +
"\n";
00073 }
00074
00075 time_t
getDataSetDate(
const string& datasetstring,
const string& alias)
00076 {
00077
VMat vm;
00078
00079
00080 map<string,string> aliases =
getDatasetAliases(
getcwd());
00081
if(aliases.find(datasetstring)!=aliases.end())
00082
return getDataSetDate(aliases[datasetstring]);
00083
00084
if(
isfile(datasetstring+
".sdb"))
00085
return mtime(datasetstring+
".sdb");
00086
else if(
pathexists(datasetstring))
00087 {
00088
if(
isfile(datasetstring))
00089 {
00090
if(
extract_extension(datasetstring)==
".vmat")
00091
return VVMatrix::getDateOfVMat(datasetstring);
00092
else return mtime(datasetstring);
00093 }
00094
else
00095 {
00096
00097
if(
isfile(datasetstring+
slash+
"indexfile") &&
isfile(datasetstring+
slash+
"0.data"))
00098
return mtime(datasetstring+
slash+
"indexfile");
00099
else
00100
PLERROR(
"In getDataSetDate: datasetstring is a directory of unknown format");
00101 }
00102 }
00103
00104
00105
00106
return 0;
00107 }
00108
00109 VMat getDataSet(
const string& datasetstring,
const string& alias)
00110 {
00111
00112 map<string,string> aliases =
getDatasetAliases(
getcwd());
00113
if(aliases.find(datasetstring)!=aliases.end())
00114
return getDataSet(aliases[datasetstring]);
00115
00116
00117
VMat vm;
00118
if(
isfile(datasetstring+
".sdb"))
00119 {
00120 vm =
new AutoSDBVMatrix(datasetstring);
00121 }
00122
else if(
pathexists(datasetstring))
00123 {
00124
if(
isfile(datasetstring))
00125 {
00126
string ext =
extract_extension(datasetstring);
00127
if(ext==
".pmat")
00128 vm =
new FileVMatrix(datasetstring);
00129
else if(ext==
".vmat" || ext==
".txtmat")
00130 {
00131
string code =
readFileAndMacroProcess(datasetstring);
00132
if(
removeblanks(code)[0]==
'<')
00133 vm =
new VVMatrix(datasetstring);
00134
else
00135 {
00136 vm = dynamic_cast<VMatrix*>(
newObject(code));
00137
if(vm.
isNull())
00138
PLERROR(
"Object described in %s is not a VMatrix subclass",datasetstring.c_str());
00139 }
00140 }
00141
else if(ext==
".amat") {
00142
00143
if (datasetstring.find(
".bin.", ((
unsigned int) datasetstring.size()) - 9) != string::npos){
00144
Mat tempMat;
00145
loadAsciiSingleBinaryDescriptor(datasetstring,tempMat);
00146 vm =
VMat(tempMat);
00147 }
else {
00148 vm =
loadAsciiAsVMat(datasetstring);
00149 }
00150 }
00151
else if(ext==
".strtable")
00152 vm =
new StrTableVMatrix(
StringTable(datasetstring));
00153
else if(ext==
".sdb")
00154 vm =
new AutoSDBVMatrix(
remove_extension(datasetstring));
00155
else if(ext==
".mat")
00156 vm =
loadAsciiAsVMat(datasetstring);
00157
else
00158
PLERROR(
"Unknown extension for vmatrix: %s", ext.c_str());
00159 vm->setMetaDataDir(
extract_directory(datasetstring) +
extract_filename(datasetstring) +
".metadata");
00160 }
00161
else
00162 {
00163
00164
if(
isfile(datasetstring+
slash+
"indexfile") &&
isfile(datasetstring+
slash+
"0.data"))
00165 {
00166 vm =
new DiskVMatrix(datasetstring);
00167 }
00168
else
00169
PLERROR(
"In getDataSet: datasetstring is a directory of unknown format");
00170 }
00171 }
00172
else
00173 {
00174
try
00175 {
00176
vector<string> dsetspec =
split(datasetstring);
00177
if(dsetspec.size()<2)
00178
PLERROR(
"In getDataSet, expecting a specification of the form '<datasetname> <train|test|all> [normalize]. DatasetString = %s' ",datasetstring.c_str());
00179
string datasetname = dsetspec[0];
00180
bool normalizeinputs =
false;
00181
if(dsetspec.size()>=3)
00182 {
00183
if(dsetspec[2]==
"normalize")
00184 normalizeinputs =
true;
00185
else PLERROR(
"In getDataSet specification of predefined dataset contains 3 words, expecting 3rd one to be 'normalize', don't understand '%s'",dsetspec[2].
c_str());
00186 }
00187
00188
int inputsize, nclasses;
00189
VMat trainset, testset, allset;
00190
loadClassificationDataset(datasetname, inputsize, nclasses, trainset, testset, normalizeinputs, allset);
00191
if(dsetspec[1]==
"train") {
00192
if (trainset==NULL) {
00193
PLERROR(
"In getDataSet, there is no trainset available.");
00194 }
00195 vm = trainset;
00196 }
00197
else if(dsetspec[1]==
"test") {
00198
if (testset==NULL) {
00199
PLERROR(
"In getDataSet, there is no testset available.");
00200 }
00201 vm = testset;
00202 }
00203
else if(dsetspec[1]==
"all") {
00204
if (allset) {
00205 vm = allset;
00206 }
else {
00207 vm =
vconcat(trainset,testset);
00208 }
00209 }
00210
else
00211
PLERROR(
"In getDataSet specification of predefined dataset: expecting second word to be 'train' or 'test' or 'all' not %s ...",dsetspec[1].
c_str());
00212 vm->defineSizes(inputsize, 1);
00213 vm->setMetaDataDir(
"/u/lisa/db/metadata/" + datasetstring);
00214 }
00215
catch(
const PLearnError& e)
00216 {
00217
try
00218 {
00219 vm = dynamic_cast<VMatrix*>(
newObject(datasetstring));
00220
if(!vm)
00221
PLERROR(
"Not a VMatrix object (dynamic cast failed)");
00222 }
00223
catch(
const PLearnError& e2)
00224 {
00225
PLERROR(
"Error in getDataSet with specification: %s\n"
00226
"Specification is neither a valid file or directory \n"
00227
"Nor is it a preprogrammed dataset (attempt returned: %s)\n"
00228
"Nor could it be resolved to a VMatrix object (attempt returned: %s)\n",
00229 datasetstring.c_str(), e.message().c_str(), e2.message().c_str());
00230 }
00231 }
00232 }
00233
00234 vm->loadAllStringMappings();
00235 vm->setAlias(alias);
00236 vm->unduplicateFieldNames();
00237
return vm;
00238 }
00239
00240
00241 string locateDatasetAliasesDir(
const string& dir_or_file_path)
00242 {
00243
if(!
pathexists(dir_or_file_path))
00244
PLERROR(
"In getDatasetAliases argument '%s' is not an existing directory or file!", dir_or_file_path.c_str());
00245
string dirname =
extract_directory(
abspath(dir_or_file_path));
00246
string dot =
".";
00247
while(dirname!=
slash && dirname!=
dot+
slash && !
isfile(dirname +
"dataset.aliases"))
00248 dirname =
extract_directory(
remove_trailing_slash(dirname));
00249
00250
if(
isfile(dirname+
"dataset.aliases"))
00251
return dirname;
00252
else
00253
return "";
00254 }
00255
00258 map<string,string>
getDatasetAliases(
const string& dir_or_file_path)
00259 {
00260 map<string,string> aliases;
00261
string dirname =
locateDatasetAliasesDir(dir_or_file_path);
00262
if(!dirname.empty() &&
isfile(dirname+
"dataset.aliases"))
00263 {
00264
string fpath = dirname+
"dataset.aliases";
00265 ifstream in(fpath.c_str());
00266
if(!in)
00267
PLERROR(
"Could not open %s for reading", fpath.c_str());
00268
while(in)
00269 {
00270
string alias;
00271 getline(in,alias,
'=');
00272 alias =
removeblanks(alias);
00273
if(alias.empty())
00274
break;
00275
string datasetdef;
00276
PLearn::read(in,datasetdef);
00277 aliases[alias] = datasetdef;
00278 in >>
ws;
00279
if(in.peek()==
';')
00280 in.
get();
00281 }
00282 }
00283
return aliases;
00284 }
00285
00286
00287 }