00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00044
#include "SelectRowsVMatrix.h"
00045
#include <plearn/var/SumOverBagsVariable.h>
00046
#include "ToBagSplitter.h"
00047
00048
namespace PLearn {
00049
using namespace std;
00050
00052
00054 ToBagSplitter::ToBagSplitter()
00055 :
Splitter(),
00056 expected_size_of_bag(10)
00057 {}
00058
00059
// PLEARN_IMPLEMENT_OBJECT invocation for this class: the class name, then a
// one-line summary, then the longer help text.
PLEARN_IMPLEMENT_OBJECT(
    ToBagSplitter,
    "A Splitter that makes any existing splitter operate on bags only.",
    "The dataset provided must contain bag information, as described in\n"
    "SumOverBagsVariable");
00063
00065
00067 void ToBagSplitter::declareOptions(
OptionList& ol)
00068 {
00069
declareOption(ol,
"expected_size_of_bag", &ToBagSplitter::expected_size_of_bag, OptionBase::buildoption,
00070
"The expected size of each bag. It is not compulsory to change this option.");
00071
00072
declareOption(ol,
"sub_splitter", &ToBagSplitter::sub_splitter, OptionBase::buildoption,
00073
"The underlying splitter we want to make operate on bags.");
00074
00075
00076 inherited::declareOptions(ol);
00077 }
00078
00080
00082 void ToBagSplitter::build()
00083 {
00084 inherited::build();
00085
build_();
00086 }
00087
00089
00091 void ToBagSplitter::build_()
00092 {
00093
if (dataset) {
00094
00095
int max_ninstances = 1;
00096
00097
00098
00099
Mat bags_store(dataset->
length() /
expected_size_of_bag + 1,
expected_size_of_bag + 1);
00100
int num_bag = 0;
00101
int num_instance = 0;
00102
int bag_signal_column = dataset->inputsize() + dataset->targetsize() - 1;
00103
for (
int i = 0; i < dataset->
length(); i++) {
00104
if (num_instance + 1 >= bags_store.
width()) {
00105
if (num_instance > 10*(
expected_size_of_bag+1))
00106
PLERROR(
"ToBagSplitter: found bag size (%d) more than 10 times bigger than expected_size_of_bag (%d)!\n",
00107 num_instance,
expected_size_of_bag);
00108
00109 bags_store.
resize(bags_store.
length(), bags_store.
width() * 2);
00110 }
00111
if (num_instance >= max_ninstances) {
00112 max_ninstances = num_instance + 1;
00113 }
00114 bags_store(num_bag, num_instance + 1) = i;
00115 num_instance++;
00116
if (
int(dataset->get(i, bag_signal_column)) & SumOverBagsVariable::TARGET_COLUMN_LAST) {
00117
00118 bags_store(num_bag, 0) = num_instance;
00119 num_bag++;
00120 num_instance = 0;
00121
if (num_bag >= bags_store.
length()) {
00122
00123 bags_store.
resize(bags_store.
length() * 2, bags_store.
width());
00124 }
00125 }
00126 }
00127
00128 bags_store.
resize(num_bag, max_ninstances + 1);
00129
bags_index =
VMat(bags_store);
00130
00131
sub_splitter->setDataSet(
bags_index);
00132 }
00133 }
00134
00136
00138 TVec<VMat> ToBagSplitter::getSplit(
int k)
00139 {
00140
00141
TVec<VMat> sub_splits =
sub_splitter->getSplit(
k);
00142
TVec<VMat> result;
00143
for (
int i = 0; i < sub_splits.
length(); i++) {
00144
00145
Mat indices = sub_splits[i].
toMat();
00146
00147
TVec<int> indices_int;
00148
for (
int j = 0; j < indices.
length(); j++) {
00149
for (
int k = 0;
k < indices(j, 0);
k++) {
00150
int indice =
int(indices(j,
k + 1));
00151 indices_int.
append(indice);
00152 }
00153 }
00154 result.
append(
new SelectRowsVMatrix(dataset, indices_int));
00155 }
00156
return result;
00157 }
00158
00160
00162 void ToBagSplitter::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies)
00163 {
00164 Splitter::makeDeepCopyFromShallowCopy(copies);
00165
00166
00167
00168
00169
00170
00171
00172
00173
PLERROR(
"ToBagSplitter::makeDeepCopyFromShallowCopy not fully (correctly) implemented yet!");
00174 }
00175
00177
00179 int ToBagSplitter::nSetsPerSplit()
const
00180
{
00181
00182
return sub_splitter->nSetsPerSplit();
00183 }
00184
00186
00188 int ToBagSplitter::nsplits()
const
00189
{
00190
return sub_splitter->nsplits();
00191 }
00192
00194
00196 void ToBagSplitter::setDataSet(
VMat the_dataset) {
00197 inherited::setDataSet(the_dataset);
00198
00199
build();
00200 }
00201
00202 }