00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00047
#ifndef DATABASES_INC
00048
#define DATABASES_INC
00049
00050
#include "UCISpecification.h"
00051
#include <plearn/vmat/VMat.h>
00052
00053
00054
namespace PLearn {
00055
using namespace std;
00056
00057
00059
/// Declared only; the definition is not part of this header.
/// NOTE(review): presumably reads a 2-D data set from `filename` into a Mat --
/// confirm against the definition.
/// @param filename  File to use; defaults to "data2d.amat".
Mat input2dSet(const string& filename = "data2d.amat");
00060
00062
void normalizeDataSets(Mat& training_set, Mat& test_set);
00063
void normalizeDataSets(Mat& training_set, Mat& validation_set, Mat& test_set);
00064
void normalizeDataSets(VMat& training_set, VMat& validation_set, VMat& test_set);
00065
00066
00067
void splitTrainValidTest(VMat &data_set,VMat &train_set,VMat &valid_set,
00068
real valid_fraction,VMat &test_set,
real test_fraction,
00069
bool normalize=
true);
00070
00071
00072 VMat
reduceInputSize(
real fraction,VMat data);
00073
00074
00075 VMat
reduceDataSetSize(
real fraction,VMat data);
00076
00078
/// Declared only; the definition is not part of this header.
/// NOTE(review): presumably rewrites the class labels of `data` so that the
/// smallest label becomes `remap_minval_to` and the largest becomes
/// `remap_maxval_to` -- confirm against the definition.
void remapClassnums(VMat& data, real remap_minval_to, real remap_maxval_to);
00079
00081
/// Loaders that fill caller-supplied train/validation/test references and
/// return an int. Definitions are not part of this header.
/// NOTE(review): the meaning of the int return value, and of `ntrain` /
/// `nvalid` (presumably row counts for the first two splits), must be
/// confirmed against the definitions.

/// @param uniq  Defaults to true.
int loadBreastCancer(VMat& training_set, VMat& validation_set, VMat& test_set,
                     int ntrain, int nvalid, bool uniq = true);

int loadDiabetes(VMat& training_set, VMat& validation_set, VMat& test_set,
                 int ntrain, int nvalid);

int loadATT800(VMat& training_set, VMat& test_set);
00084
int loadLetters(VMat& training_set, VMat& validation_set, VMat& test_set,
char* letters=
"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
real validation_fraction=.2,
real test_fraction=.4,
bool do_shuffle=
true);
00085
int loadLetters(VMat& training_set, VMat& validation_set, VMat& test_set,
int n_letters,
real validation_fraction=.2,
real test_fraction=.4,
bool do_shuffle=
true);
00086
00087
void loadCorelDatamat(
int classnum, Mat& train, Mat& valid, Mat& test);
00088
Mat smoothCorelHisto(Mat& data);
00089
void loadCorel(Mat& training_set, Mat& validation_set, Mat& test_set,
int negative_class=2,
int positive_class=3);
00090
/// Declared only; the definition is not part of this header.
/// NOTE(review): presumably loads the "callxx" data for `year` into `d` --
/// confirm against the definition.
void loadCallxx(int year, VMat& d);
00091
00092 VMat
loadBreastCancerWisconsin(
bool normalize=
true,
bool uniq=
true);
00093 VMat
loadSonar();
00094 VMat
loadIonosphere();
00095 VMat
loadDiabetes(
bool normalize=
true);
00096 VMat
loadPimaIndians(
bool normalize=
true);
00097 VMat
loadLetters(
const char* class0,
const char* class1,
bool normalize=
true);
00098 VMat
loadLetters(
bool normalize=
true);
00099 VMat
loadLetters(
int n_letters,
bool do_shuffle);
00100
void loadUSPS(VMat& trainset, VMat& testset,
bool use_smooth=
true);
00101 VMat
loadUSPS(
bool use_smooth=
true);
00102 VMat
loadHousing(
bool normalize=
true);
00107
void loadUCI(VMat& trainset, VMat& testset, VMat& allset,
string db_spec,
string id,
bool &normalize,
const string& type);
00109
void loadUCISet(VMat& data,
string file, PP<UCISpecification> uci_spec);
00111
void loadUCIAMat(VMat& data,
string file, PP<UCISpecification> uci_spec);
00112
00113
/// Returns a fixed, multi-line help text listing the preprogrammed data set
/// names and the optional ":size" suffix.
/// NOTE(review): presumably these are the names accepted as `dbname` by
/// loadClassificationDataset() -- confirm against its definition.
/// NOTE(review): the "nist0 ... usps9" entry looks like a typo (perhaps
/// "mnist0 ... mnist9"); the text is left unchanged because the intended
/// names cannot be confirmed from this header.
inline std::string loadClassificationDatasetHelp()
{
    return " Preprogrammed datasets are: \n"
           " 2d \n"
           " letters \n"
           " breast \n"
           " usps \n"
           " mnist \n"
           " usps0 ... usps9 \n"
           " nist0 ... usps9 \n"
           " They can optionally be followed by :size in which case only the 'size' \n"
           " first rows will be kept. \n";
}
00140
00141
/// Declared only; the definition is not part of this header.
/// NOTE(review): presumably loads the data set named `dbname` and reports
/// its input size and class count through the reference parameters --
/// confirm against the definition.
/// @param dbname           Data set name.
/// @param inputsize        Non-const reference; presumably an output.
/// @param nclasses         Non-const reference; presumably an output.
/// @param trainset         Non-const reference to the training set.
/// @param testset          Non-const reference to the test set.
/// @param normalizeinputs  Required flag (no default).
/// @param allset           Non-const reference to the full set.
void loadClassificationDataset(const string& dbname,
                               int& inputsize,
                               int& nclasses,
                               VMat& trainset,
                               VMat& testset,
                               bool normalizeinputs,
                               VMat& allset);
00142
00143
00144 }
00145
00146
00147
#endif
00148
00149
00150
00151
00152
00153
00154