00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00041
#include <plearn/math/random.h>
00042
#include "RepeatSplitter.h"
00043
#include "SelectRowsVMatrix.h"
00044
00045
namespace PLearn {
00046
using namespace std;
00047
00049
00051 RepeatSplitter::RepeatSplitter()
00052 :
00053 last_n(-1),
00054 do_not_shuffle_first(0),
00055 force_proportion(-1),
00056 n(1),
00057
seed(-1),
00058
shuffle(0)
00059 {
00060 }
00061
00062
// Registers RepeatSplitter with the PLearn object system, together with
// its one-line description and its (currently empty) extended help.
PLEARN_IMPLEMENT_OBJECT(
    RepeatSplitter,
    "Repeat a given splitter a certain amount of times, with the possibility to\n"
    "shuffle randomly the dataset each time",
    "NO HELP");
00066
00068
00070 void RepeatSplitter::declareOptions(
OptionList& ol)
00071 {
00072
declareOption(ol,
"do_not_shuffle_first", &RepeatSplitter::do_not_shuffle_first, OptionBase::buildoption,
00073
"If set to 1, then the dataset won't be shuffled the first time we do the splitting.\n"
00074
"It only makes sense to use this option if 'shuffle' is set to 1.");
00075
00076
declareOption(ol,
"force_proportion", &RepeatSplitter::force_proportion, OptionBase::buildoption,
00077
"If a target value appears at least once every x samples, will ensure that after\n"
00078
"shuffling it appears at least once every (x * 'force_proportion') samples, and not\n"
00079
"more than once every (x / 'force_proportion') samples. Will be ignored if < 1.\n"
00080
"Note that this currently only works for a binary target! (and hasn't been 100% tested).");
00081
00082
declareOption(ol,
"n", &RepeatSplitter::n, OptionBase::buildoption,
00083
"How many times we want to repeat.");
00084
00085
declareOption(ol,
"seed", &RepeatSplitter::seed, OptionBase::buildoption,
00086
"Initializes the random number generator (only if shuffle is set to 1).\n"
00087
"If set to -1, the initialization will depend on the clock.");
00088
00089
declareOption(ol,
"shuffle", &RepeatSplitter::shuffle, OptionBase::buildoption,
00090
"If set to 1, the dataset will be shuffled differently at each repetition.");
00091
00092
declareOption(ol,
"to_repeat", &RepeatSplitter::to_repeat, OptionBase::buildoption,
00093
"The splitter we want to repeat.");
00094
00095 inherited::declareOptions(ol);
00096 }
00097
00099
00101 void RepeatSplitter::build()
00102 {
00103 inherited::build();
00104
build_();
00105 }
00106
00108
00110 void RepeatSplitter::build_()
00111 {
00112
if (
shuffle && dataset) {
00113
00114
if (
seed >= 0)
00115
manual_seed(
seed);
00116
else
00117
PLearn::seed();
00118
int n_splits =
nsplits();
00119
indices =
TMat<int>(n_splits, dataset.
length());
00120
TVec<int> shuffled;
00121
for (
int i = 0; i < n_splits; i++) {
00122 shuffled =
TVec<int>(0, dataset.
length()-1, 1);
00123
00124
if (!
do_not_shuffle_first || i > 0) {
00125
shuffleElements(shuffled);
00126
if (
force_proportion >= 1) {
00127
00128
00129
StatsCollector tsc(2000);
00130
if (dataset->targetsize() != 1) {
00131
PLERROR(
"In RepeatSplitter::build_ - 'force_proportion' is only implemented for a 1-dimensional target");
00132 }
00133
real t;
00134
for (
int j = 0; j < dataset->
length(); j++) {
00135 t = dataset->get(j, dataset->inputsize());
00136 tsc.
update(t);
00137 }
00138 tsc.
finalize();
00139
00140
int count = (
int) tsc.
getCounts()->size() - 1;
00141
if (
count != 2) {
00142
PLERROR(
"In RepeatSplitter::build_ - 'force_proportion' is only implemented for a binary target");
00143 }
00144
00145
int j = 0;
00146
for (map<real,StatsCollectorCounts>::iterator it = tsc.
getCounts()->begin(); j <
count; j++) {
00147 t = it->first;
00148
real prop_t =
real(it->second.n) / real(dataset->
length());
00149
00150
00151
00152
00153
00154
00155
00156
int step = 20;
00157
bool ok =
false;
00158
while (!ok) {
00159
int n =
int(step * prop_t + 0.5);
00160
if (n >= 2 && n <= step - 10
00161 &&
abs(step * prop_t - real(n)) / real(step) < 0.01) {
00162 ok =
true;
00163 }
else {
00164
00165 step *= 2;
00166 }
00167 }
00168
int expected_count =
int(step * prop_t + 0.5);
00169
00170
00171 ok =
false;
00172
int tc = dataset->inputsize();
00173
while (!ok) {
00174 ok =
true;
00175
00176
int first_pass_step = int(step *
force_proportion + 0.5);
00177
int k,l;
00178
for (
k = 0;
k < shuffled.
length();
k += first_pass_step) {
00179
int count_target = 0;
00180
for (l =
k; l <
k + first_pass_step && l < shuffled.
length(); l++) {
00181
if (dataset->get(shuffled[l], tc) == t) {
00182 count_target++;
00183 }
00184 }
00185
if (l -
k == first_pass_step && count_target < expected_count) {
00186
00187 ok =
false;
00188
00189
for (
int m = 0; m < expected_count - count_target; m++) {
00190
bool can_swap =
false;
00191
int to_swap = -1;
00192
00193
while (!can_swap) {
00194 to_swap = int(
uniform_sample() * first_pass_step);
00195
if (dataset->get(shuffled[
k + to_swap], tc) != t) {
00196 can_swap =
true;
00197 }
00198 }
00199 to_swap +=
k;
00200
00201
int next =
k + first_pass_step - 1;
00202 can_swap =
false;
00203
while (!can_swap) {
00204 next++;
00205
if (next >= shuffled.
length()) {
00206 next = 0;
00207 }
00208
if (dataset->get(shuffled[next], tc) == t) {
00209 can_swap =
true;
00210 }
00211 }
00212
00213
int tmp = shuffled[next];
00214 shuffled[next] = shuffled[to_swap];
00215 shuffled[to_swap] = tmp;
00216 }
00217 }
00218 }
00219
00220
int second_pass_step = int(step / force_proportion + 0.5);
00221
for (
k = 0;
k < shuffled.
length();
k += second_pass_step) {
00222
int count_target = 0;
00223
for (l =
k; l <
k + second_pass_step && l < shuffled.
length(); l++) {
00224
if (dataset->get(shuffled[l], tc) == count_target) {
00225 count_target++;
00226 }
00227 }
00228
if (l -
k == second_pass_step && count_target > expected_count) {
00229
00230 ok =
false;
00231
PLWARNING(
"In RepeatSplitter::build_ - The code reached hasn't been tested yet");
00232
00233
for (
int m = 0; m < - expected_count + count_target; m++) {
00234
bool can_swap =
false;
00235
int to_swap =
k - 1;
00236
00237
while (!can_swap) {
00238 to_swap++;
00239
if (dataset->get(shuffled[to_swap], tc) == t) {
00240 can_swap =
true;
00241 }
00242 }
00243
00244
int next =
k + first_pass_step - 1;
00245 can_swap =
false;
00246
while (!can_swap) {
00247 next++;
00248
if (next >= shuffled.
length()) {
00249 next = 0;
00250 }
00251
if (dataset->get(shuffled[next], tc) != t) {
00252 can_swap =
true;
00253 }
00254 }
00255
00256
int tmp = shuffled[next];
00257 shuffled[next] = shuffled[to_swap];
00258 shuffled[to_swap] = tmp;
00259 }
00260 }
00261 }
00262 }
00263 it++;
00264 }
00265 }
00266 }
00267
indices(i) << shuffled;
00268 }
00269 }
else {
00270
indices =
TMat<int>();
00271 }
00272
last_n = -1;
00273 }
00274
00276
00278 void RepeatSplitter::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies)
00279 {
00280 Splitter::makeDeepCopyFromShallowCopy(copies);
00281
00282
00283
00284
00285
00286
00287
00288
deepCopyField(
to_repeat, copies);
00289
00290 }
00291
00293
00295 TVec<VMat> RepeatSplitter::getSplit(
int k)
00296 {
00297
int n_splits = this->
nsplits();
00298
if (
k >= n_splits) {
00299
PLERROR(
"In RepeatSplitter::getSplit: split asked is too high");
00300 }
00301
int child_splits =
to_repeat->nsplits();
00302
int real_k =
k % child_splits;
00303
if (
shuffle && dataset) {
00304
int shuffle_indice =
k / child_splits;
00305
if (shuffle_indice !=
last_n) {
00306
00307
VMat m =
new SelectRowsVMatrix(dataset,
indices(shuffle_indice));
00308
to_repeat->setDataSet(m);
00309
last_n = shuffle_indice;
00310 }
00311 }
00312
return to_repeat->getSplit(real_k);
00313 }
00314
00316
00318 int RepeatSplitter::nSetsPerSplit()
const
00319
{
00320
return to_repeat->nSetsPerSplit();
00321 }
00322
00324
00326 int RepeatSplitter::nsplits()
const
00327
{
00328
return to_repeat->nsplits() *
n;
00329 }
00330
00332
00334 void RepeatSplitter::setDataSet(
VMat the_dataset) {
00335 inherited::setDataSet(the_dataset);
00336
to_repeat->setDataSet(the_dataset);
00337
build();
00338 }
00339
00340 }