00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00045
#include "TrainTestBagsSplitter.h"
00046
#include "SumOverBagsVariable.h"
00047
00048
namespace PLearn {
00049
using namespace std;
00050
00051 TrainTestBagsSplitter::TrainTestBagsSplitter(
real the_test_fraction)
00052 : append_train(0), test_fraction(the_test_fraction)
00053 {};
00054
00055
// Hands the class name, a one-line summary and a multi-line description
// to PLEARN_IMPLEMENT_OBJECT.
PLEARN_IMPLEMENT_OBJECT(
    TrainTestBagsSplitter,
    "Splits a dataset in two parts",
    "TrainTestBagsSplitter implements a single split of the dataset into\n"
    "a training set and a test set (the test part being the last few samples of the dataset)\n"
    "Optionally a third set is provided which is the training set itself (in order to test on it)\n");
00059
00060 void TrainTestBagsSplitter::declareOptions(
OptionList& ol)
00061 {
00062
declareOption(ol,
"append_train", &TrainTestBagsSplitter::append_train, OptionBase::buildoption,
00063
"if set to 1, the trainset will be appended after the test set (thus each split"
00064
" will contain three sets)");
00065
00066
declareOption(ol,
"test_fraction", &TrainTestBagsSplitter::test_fraction, OptionBase::buildoption,
00067
"the fraction of the dataset reserved to the test set");
00068
00069 inherited::declareOptions(ol);
00070 }
00071
00072 void TrainTestBagsSplitter::build_()
00073 {
00074 }
00075
00076
00077 void TrainTestBagsSplitter::build()
00078 {
00079 inherited::build();
00080
build_();
00081 }
00082
00083 int TrainTestBagsSplitter::nsplits()
const
00084
{
00085
return 1;
00086 }
00087
00088 int TrainTestBagsSplitter::nSetsPerSplit()
const
00089
{
00090
if (
append_train)
00091
return 3;
00092
else
00093
return 2;
00094 }
00095
00096 TVec<VMat> TrainTestBagsSplitter::getSplit(
int k)
00097 {
00098
00099
if (
k)
00100
PLERROR(
"TrainTestBagsSplitter::getSplit() - k cannot be greater than 0");
00101
00102
TVec<VMat> split_(2);
00103
00104
int l = dataset->
length();
00105
int test_length =
int(
test_fraction*l);
00106
00107
Vec v;
00108 dataset->getRow(test_length, v);
00109
00110
00111
PLWARNING(
"In TrainTestBagsSplitter::getSplit - I think it won't work (see TODO in code)");
00112
while ( v[dataset->
width()-1] != SumOverBagsVariable::TARGET_COLUMN_LAST)
00113 {
00114 ++test_length;
00115 dataset->getRow(test_length, v);
00116 }
00117
00118
int train_length = l - test_length;
00119
00120 split_[0] = dataset.
subMatRows(0, train_length);
00121 split_[1] = dataset.
subMatRows(train_length, test_length);
00122
if (
append_train) {
00123 split_.
resize(3);
00124 split_[2] = split_[0];
00125 }
00126
return split_;
00127 }
00128
00129 }