Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

getDataSet.cc

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // PLearn (A C++ Machine Learning Library) 00004 // Copyright (C) 1998 Pascal Vincent 00005 // Copyright (C) 1999,2000 Pascal Vincent, Yoshua Bengio and University of Montreal 00006 // 00007 00008 // Redistribution and use in source and binary forms, with or without 00009 // modification, are permitted provided that the following conditions are met: 00010 // 00011 // 1. Redistributions of source code must retain the above copyright 00012 // notice, this list of conditions and the following disclaimer. 00013 // 00014 // 2. Redistributions in binary form must reproduce the above copyright 00015 // notice, this list of conditions and the following disclaimer in the 00016 // documentation and/or other materials provided with the distribution. 00017 // 00018 // 3. The name of the authors may not be used to endorse or promote 00019 // products derived from this software without specific prior written 00020 // permission. 00021 // 00022 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00023 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00024 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00025 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00026 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00027 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00028 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00029 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00030 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00031 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00032 // 00033 // This file is part of the PLearn library. For more information on the PLearn 00034 // library, go to the PLearn Web site at www.plearn.org 00035 00036 00037 00038 /* ******************************************************* 00039 * $Id: getDataSet.cc,v 1.24 2004/07/21 16:30:51 chrish42 Exp $ 00040 * AUTHORS: Pascal Vincent 00041 * This file is part of the PLearn library. 00042 ******************************************************* */ 00043 00044 //#include "MatIO.h" 00045 #include "AutoSDBVMatrix.h" 00046 //#include "stringutils.h" 00047 //#include "fileutils.h" 00048 #include <plearn/vmat/ConcatRowsVMatrix.h> 00049 #include "databases.h" 00050 #include <plearn/vmat/DiskVMatrix.h> 00051 #include <plearn/vmat/FileVMatrix.h> 00052 #include "getDataSet.h" 00053 //#include "ConcatColumnsVMatrix.h" 00054 //#include "SubVMatrix.h" 00055 #include <plearn/vmat/StrTableVMatrix.h> 00056 #include <plearn/base/StringTable.h> 00057 #include <plearn/vmat/VMat.h> 00058 #include <plearn/vmat/VVMatrix.h> 00059 00060 namespace PLearn { 00061 using namespace std; 00062 00063 00064 string getDataSetHelp() 00065 { 00066 return "Dataset specification can be one of: \n" 00067 " - the path to a matrix file (or directory) .amat .pmat .vmat .dmat or plain ascii \n" 00068 " - the basename of an .sdb \n" 00069 " - the object specification of a VMatrix subclass \n" 00070 " - the specification of a preprogrammed dataset i.e. one of the datasetnames below,\n" 00071 " followed by the word 'train' or 'test', optionally followed by the word 'normalize'\n" 00072 + loadClassificationDatasetHelp() + "\n"; 00073 } 00074 00075 time_t getDataSetDate(const string& datasetstring, const string& alias) 00076 { 00077 VMat vm; 00078 00079 // search for an alias in a dataset.aliases file 00080 map<string,string> aliases = getDatasetAliases(getcwd()); 00081 if(aliases.find(datasetstring)!=aliases.end()) 00082 return getDataSetDate(aliases[datasetstring]); 00083 00084 if(isfile(datasetstring+".sdb")) // it's an sdb 00085 return mtime(datasetstring+".sdb"); 00086 else if(pathexists(datasetstring)) 00087 { 00088 if(isfile(datasetstring)) 00089 { 00090 if(extract_extension(datasetstring)==".vmat") 00091 return VVMatrix::getDateOfVMat(datasetstring); 00092 else return mtime(datasetstring); 00093 } 00094 else // it's a directory 00095 { 00096 // is it the directory of a DiskVMatrix? 00097 if(isfile(datasetstring+slash+"indexfile") && isfile(datasetstring+slash+"0.data")) 00098 return mtime(datasetstring+slash+"indexfile"); 00099 else 00100 PLERROR("In getDataSetDate: datasetstring is a directory of unknown format"); 00101 } 00102 } 00103 00104 // Otherwise, it could be a preprogrammed dataset, or just about anything... 00105 // we don't really know. So we return 0. 00106 return 0; 00107 } 00108 00109 VMat getDataSet(const string& datasetstring, const string& alias) 00110 { 00111 // search for an alias in a dataset.aliases file 00112 map<string,string> aliases = getDatasetAliases(getcwd()); 00113 if(aliases.find(datasetstring)!=aliases.end()) 00114 return getDataSet(aliases[datasetstring]); 00115 00116 // it wasn't an alias 00117 VMat vm; 00118 if(isfile(datasetstring+".sdb")) // it's an sdb (without the .sdb extension...) 00119 { 00120 vm = new AutoSDBVMatrix(datasetstring); 00121 } 00122 else if(pathexists(datasetstring)) 00123 { 00124 if(isfile(datasetstring)) 00125 { 00126 string ext = extract_extension(datasetstring); 00127 if(ext==".pmat") 00128 vm = new FileVMatrix(datasetstring); 00129 else if(ext==".vmat" || ext==".txtmat") 00130 { 00131 string code = readFileAndMacroProcess(datasetstring); 00132 if(removeblanks(code)[0]=='<') // old xml-like format 00133 vm = new VVMatrix(datasetstring); 00134 else 00135 { 00136 vm = dynamic_cast<VMatrix*>(newObject(code)); 00137 if(vm.isNull()) 00138 PLERROR("Object described in %s is not a VMatrix subclass",datasetstring.c_str()); 00139 } 00140 } 00141 else if(ext==".amat") { 00142 // Check if the extension is ".bin.amat". 00143 if (datasetstring.find(".bin.", ((unsigned int) datasetstring.size()) - 9) != string::npos){ 00144 Mat tempMat; 00145 loadAsciiSingleBinaryDescriptor(datasetstring,tempMat); 00146 vm = VMat(tempMat); 00147 } else { 00148 vm = loadAsciiAsVMat(datasetstring); 00149 } 00150 } 00151 else if(ext==".strtable") 00152 vm = new StrTableVMatrix(StringTable(datasetstring)); 00153 else if(ext==".sdb") 00154 vm = new AutoSDBVMatrix(remove_extension(datasetstring)); 00155 else if(ext==".mat") 00156 vm = loadAsciiAsVMat(datasetstring); 00157 else 00158 PLERROR("Unknown extension for vmatrix: %s", ext.c_str()); 00159 vm->setMetaDataDir(extract_directory(datasetstring) + extract_filename(datasetstring) + ".metadata"); 00160 } 00161 else // it's a directory 00162 { 00163 // is it the directory of a DiskVMatrix? 00164 if(isfile(datasetstring+slash+"indexfile") && isfile(datasetstring+slash+"0.data")) 00165 { 00166 vm = new DiskVMatrix(datasetstring); 00167 } 00168 else 00169 PLERROR("In getDataSet: datasetstring is a directory of unknown format"); 00170 } 00171 } 00172 else // it's either a preprogrammed dataset, or a VMatrix object 00173 { 00174 try // try with a preprogrammed dataset 00175 { 00176 vector<string> dsetspec = split(datasetstring); 00177 if(dsetspec.size()<2) 00178 PLERROR("In getDataSet, expecting a specification of the form '<datasetname> <train|test|all> [normalize]. DatasetString = %s' ",datasetstring.c_str()); 00179 string datasetname = dsetspec[0]; 00180 bool normalizeinputs = false; 00181 if(dsetspec.size()>=3) 00182 { 00183 if(dsetspec[2]=="normalize") 00184 normalizeinputs = true; 00185 else PLERROR("In getDataSet specification of predefined dataset contains 3 words, expecting 3rd one to be 'normalize', don't understand '%s'",dsetspec[2].c_str()); 00186 } 00187 00188 int inputsize, nclasses; 00189 VMat trainset, testset, allset; 00190 loadClassificationDataset(datasetname, inputsize, nclasses, trainset, testset, normalizeinputs, allset); 00191 if(dsetspec[1]=="train") { 00192 if (trainset==NULL) { 00193 PLERROR("In getDataSet, there is no trainset available."); 00194 } 00195 vm = trainset; 00196 } 00197 else if(dsetspec[1]=="test") { 00198 if (testset==NULL) { 00199 PLERROR("In getDataSet, there is no testset available."); 00200 } 00201 vm = testset; 00202 } 00203 else if(dsetspec[1]=="all") { 00204 if (allset) { 00205 vm = allset; 00206 } else { 00207 vm = vconcat(trainset,testset); 00208 } 00209 } 00210 else 00211 PLERROR("In getDataSet specification of predefined dataset: expecting second word to be 'train' or 'test' or 'all' not %s ...",dsetspec[1].c_str()); 00212 vm->defineSizes(inputsize, 1); 00213 vm->setMetaDataDir("/u/lisa/db/metadata/" + datasetstring); 00214 } 00215 catch(const PLearnError& e) // OK, it wasn't a preprogrammed dataset, let's try with a VMatrix object 00216 { 00217 try 00218 { 00219 vm = dynamic_cast<VMatrix*>(newObject(datasetstring)); 00220 if(!vm) 00221 PLERROR("Not a VMatrix object (dynamic cast failed)"); 00222 } 00223 catch(const PLearnError& e2) 00224 { 00225 PLERROR("Error in getDataSet with specification: %s\n" 00226 "Specification is neither a valid file or directory \n" 00227 "Nor is it a preprogrammed dataset (attempt returned: %s)\n" 00228 "Nor could it be resolved to a VMatrix object (attempt returned: %s)\n", 00229 datasetstring.c_str(), e.message().c_str(), e2.message().c_str()); 00230 } 00231 } 00232 } 00233 00234 vm->loadAllStringMappings(); 00235 vm->setAlias(alias); 00236 vm->unduplicateFieldNames(); 00237 return vm; 00238 } 00239 00240 00241 string locateDatasetAliasesDir(const string& dir_or_file_path) 00242 { 00243 if(!pathexists(dir_or_file_path)) 00244 PLERROR("In getDatasetAliases argument '%s' is not an existing directory or file!", dir_or_file_path.c_str()); 00245 string dirname = extract_directory(abspath(dir_or_file_path)); 00246 string dot = "."; 00247 while(dirname!=slash && dirname!=dot+slash && !isfile(dirname + "dataset.aliases")) 00248 dirname = extract_directory(remove_trailing_slash(dirname)); 00249 00250 if(isfile(dirname+"dataset.aliases")) 00251 return dirname; 00252 else 00253 return ""; 00254 } 00255 00258 map<string,string> getDatasetAliases(const string& dir_or_file_path) 00259 { 00260 map<string,string> aliases; 00261 string dirname = locateDatasetAliasesDir(dir_or_file_path); 00262 if(!dirname.empty() && isfile(dirname+"dataset.aliases")) 00263 { 00264 string fpath = dirname+"dataset.aliases"; 00265 ifstream in(fpath.c_str()); 00266 if(!in) 00267 PLERROR("Could not open %s for reading", fpath.c_str()); 00268 while(in) 00269 { 00270 string alias; 00271 getline(in,alias,'='); 00272 alias = removeblanks(alias); 00273 if(alias.empty()) 00274 break; 00275 string datasetdef; 00276 PLearn::read(in,datasetdef); 00277 aliases[alias] = datasetdef; 00278 in >> ws;//skipBlanks(in); 00279 if(in.peek()==';') 00280 in.get(); 00281 } 00282 } 00283 return aliases; 00284 } 00285 00286 00287 } // end of namespace PLearn

Generated on Tue Aug 17 15:54:24 2004 for PLearn by doxygen 1.3.7