00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00044
#include "ConcatRowsVMatrix.h"
00045
#include "SubVMatrix.h"
00046
#include "TestInTrainSplitter.h"
00047
00048
namespace PLearn {
00049
using namespace std;
00050
00051 TestInTrainSplitter::TestInTrainSplitter()
00052 : percentage_added(0.1)
00053
00054 {
00055
00056
00057
00058
00059 }
00060
00061
// Registers the class through PLEARN_IMPLEMENT_OBJECT with a one-line
// description followed by the multi-line help text.
PLEARN_IMPLEMENT_OBJECT(
    TestInTrainSplitter,
    "A splitter that adds the test points given by another splitter into the training set.",
    "The underlying splitter should return train / test sets of constant size.\n"
    "For instance, if the underlying splitter returns 3 splits of (train,test)\n"
    "pairs with size 2000 and 500, this splitter will return:\n"
    " - for 'percentage_added' == 5%, 15 splits of size 2100 and 100, with each\n"
    " test point appearing once and only once in a train set and a test set\n"
    " - for 'percentage_added' == 20%, 6 splits of size 2400,400 and 2400,100, with\n"
    " each test point appearing once or more in a train set, and only once in a\n"
    " test set (note that the test points appearing more than once in a train set\n"
    " will be those at the beginning of the test sets returned by the underlying\n"
    " splitter)\n"
);
00074
00075 void TestInTrainSplitter::declareOptions(
OptionList& ol)
00076 {
00077
00078
00079
00080
00081
00082
00083
declareOption(ol,
"percentage_added", &TestInTrainSplitter::percentage_added, OptionBase::buildoption,
00084
"The ratio between the number of examples in the test set added to the train set and the\n"
00085
"number of examples in the train set.");
00086
00087
declareOption(ol,
"source_splitter", &TestInTrainSplitter::source_splitter, OptionBase::buildoption,
00088
"The underlying splitter.");
00089
00090
00091 inherited::declareOptions(ol);
00092 }
00093
00094 void TestInTrainSplitter::build_()
00095 {
00096
00097
00098
00099
00100
00101
00102
00103 }
00104
00105 void TestInTrainSplitter::build()
00106 {
00107 inherited::build();
00108
build_();
00109 }
00110
00111 void TestInTrainSplitter::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies)
00112 {
00113 Splitter::makeDeepCopyFromShallowCopy(copies);
00114
00115
00116
00117
00118
00119
00120
00121
00122
PLERROR(
"TestInTrainSplitter::makeDeepCopyFromShallowCopy not fully (correctly) implemented yet!");
00123 }
00124
00126
00128 int TestInTrainSplitter::nsplits()
const
00129
{
00130
if (
first_source_split.
isEmpty()) {
00131
getFirstSplit();
00132 }
00133
n_to_add =
int(
n_train *
percentage_added + 0.5);
00134
if (
n_to_add == 0) {
00135
00136
n_splits_per_source_split = 1;
00137
n_left = 0;
00138 }
else {
00139
n_splits_per_source_split =
n_test /
n_to_add;
00140
n_left =
n_test %
n_to_add;
00141 }
00142
if (
n_splits_per_source_split == 0) {
00143
PLERROR(
"In TestInTrainSplitter::nsplits - Asked to add more test samples than available");
00144 }
00145
if (
n_left > 0)
00146
n_splits_per_source_split++;
00147
int n_total =
n_splits_per_source_split *
source_splitter->nsplits();
00148
return n_total;
00149 }
00150
00152
00154 int TestInTrainSplitter::nSetsPerSplit()
const
00155
{
00156
return source_splitter->nSetsPerSplit();
00157 }
00158
00160
00162 void TestInTrainSplitter::getFirstSplit()
const {
00163
first_source_split =
source_splitter->getSplit(0);
00164
n_train =
first_source_split[0]->
length();
00165
n_test = first_source_split[1]->length();
00166 }
00167
00169
00171 TVec<VMat> TestInTrainSplitter::getSplit(
int k)
00172 {
00173
TVec<VMat> source_split;
00174
if (
first_source_split.
isEmpty()) {
00175
getFirstSplit();
00176 }
00177
if (
k == 0) {
00178 source_split =
first_source_split;
00179 }
else {
00180 source_split =
source_splitter->getSplit(
k /
n_splits_per_source_split);
00181 }
00182
int n_test_part =
k %
n_splits_per_source_split;
00183
int i_test_start = n_test_part *
n_to_add;
00184
VMat train_set = source_split[0];
00185
VMat test_set = source_split[1];
00186
if (train_set->
length() !=
n_train || test_set->
length() !=
n_test) {
00187
PLERROR(
"In TestInTrainSplitter::getSplit - The train / test sizes have changed!");
00188 }
00189
TVec<VMat> result(source_split.
length());
00190
if (n_to_add == 0) {
00191
00192 result[0] = train_set;
00193 result[1] = test_set;
00194 }
else if (
n_left == 0 || n_test_part !=
n_splits_per_source_split - 1) {
00195
00196
VMat added_to_train =
new SubVMatrix(test_set, i_test_start, 0, n_to_add, test_set->
width());
00197 result[0] =
vconcat(train_set, added_to_train);
00198 result[1] = added_to_train;
00199 }
else {
00200
00201
00202
VMat new_test =
new SubVMatrix(test_set, i_test_start, 0,
n_left, test_set->
width());
00203 result[1] = new_test;
00204
VMat compl_for_train =
new SubVMatrix(test_set, 0, 0, n_to_add -
n_left, test_set->
width());
00205
VMat added_to_train =
vconcat(new_test, compl_for_train);
00206 result[0] =
vconcat(train_set, added_to_train);
00207 }
00208
for (
int i = 2; i < result.
length(); i++) {
00209 result[i] = source_split[i];
00210 }
00211
return result;
00212 }
00213
00215
00217 void TestInTrainSplitter::setDataSet(
VMat the_dataset) {
00218
first_source_split.
resize(0);
00219 inherited::setDataSet(the_dataset);
00220
source_splitter->setDataSet(the_dataset);
00221 }
00222
00223 }