PLearn: TestInTrainSplitter.cc Source File

00001 // -*- C++ -*- 00002 00003 // TestInTrainSplitter.cc 00004 // 00005 // Copyright (C) 2004 Olivier Delalleau 00006 // 00007 // Redistribution and use in source and binary forms, with or without 00008 // modification, are permitted provided that the following conditions are met: 00009 // 00010 // 1. Redistributions of source code must retain the above copyright 00011 // notice, this list of conditions and the following disclaimer. 00012 // 00013 // 2. Redistributions in binary form must reproduce the above copyright 00014 // notice, this list of conditions and the following disclaimer in the 00015 // documentation and/or other materials provided with the distribution. 00016 // 00017 // 3. The name of the authors may not be used to endorse or promote 00018 // products derived from this software without specific prior written 00019 // permission. 00020 // 00021 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00022 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00023 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00024 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00025 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00026 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00027 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00028 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00029 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00030 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00031 // 00032 // This file is part of the PLearn library. For more information on the PLearn 00033 // library, go to the PLearn Web site at www.plearn.org 00034 00035 /* ******************************************************* 00036 * $Id: TestInTrainSplitter.cc,v 1.3 2004/06/08 13:16:29 tihocan Exp $ 00037 ******************************************************* */ 00038 00039 // Authors: Olivier Delalleau 00040 00044 #include "ConcatRowsVMatrix.h" 00045 #include "SubVMatrix.h" 00046 #include "TestInTrainSplitter.h" 00047 00048 namespace PLearn { 00049 using namespace std; 00050 00051 TestInTrainSplitter::TestInTrainSplitter() 00052 : percentage_added(0.1) 00053 /* ### Initialize all fields to their default value */ 00054 { 00055 // ... 00056 00057 // ### You may or may not want to call build_() to finish building the object 00058 // build_(); 00059 } 00060 00061 PLEARN_IMPLEMENT_OBJECT(TestInTrainSplitter, 00062 "A splitter that adds the test points given by another splitter into the training set.", 00063 "The underlying splitter should return train / test sets of constant size.\n" 00064 "For instance, if the underlying splitter returns 3 splits of (train,test)\n" 00065 "pairs with size 2000 and 500, this splitter will return:\n" 00066 " - for 'percentage_added' == 5%, 15 splits of size 2100 and 100, with each\n" 00067 " test point appearing once and only once in a train set and a test set\n" 00068 " - for 'percentage_added' == 20%, 6 splits of size 2400,400 and 2400,100, with\n" 00069 " each test point appearing once or more in a train set, and only once in a\n" 00070 " test set (note that the test points appearing more than once in a train set\n" 00071 " will be those at the beginning of the test sets returned by the underlying\n" 00072 " splitter)\n" 00073 ); 00074 00075 void TestInTrainSplitter::declareOptions(OptionList& ol) 00076 { 00077 // ### Declare all of this object's options here 00078 // ### For the "flags" of each option, you should typically specify 00079 // ### one of OptionBase::buildoption, OptionBase::learntoption or 00080 // ### OptionBase::tuningoption. Another possible flag to be combined with 00081 // ### is OptionBase::nosave 00082 00083 declareOption(ol, "percentage_added", &TestInTrainSplitter::percentage_added, OptionBase::buildoption, 00084 "The ratio between the number of examples in the test set added to the train set and the\n" 00085 "number of examples in the train set."); 00086 00087 declareOption(ol, "source_splitter", &TestInTrainSplitter::source_splitter, OptionBase::buildoption, 00088 "The underlying splitter."); 00089 00090 // Now call the parent class' declareOptions 00091 inherited::declareOptions(ol); 00092 } 00093 00094 void TestInTrainSplitter::build_() 00095 { 00096 // ### This method should do the real building of the object, 00097 // ### according to set 'options', in *any* situation. 00098 // ### Typical situations include: 00099 // ### - Initial building of an object from a few user-specified options 00100 // ### - Building of a "reloaded" object: i.e. from the complete set of all serialised options. 00101 // ### - Updating or "re-building" of an object after a few "tuning" options have been modified. 00102 // ### You should assume that the parent class' build_() has already been called. 00103 } 00104 00105 void TestInTrainSplitter::build() 00106 { 00107 inherited::build(); 00108 build_(); 00109 } 00110 00111 void TestInTrainSplitter::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies) 00112 { 00113 Splitter::makeDeepCopyFromShallowCopy(copies); 00114 00115 // ### Call deepCopyField on all "pointer-like" fields 00116 // ### that you wish to be deepCopied rather than 00117 // ### shallow-copied. 00118 // ### ex: 00119 // deepCopyField(trainvec, copies); 00120 00121 // ### Remove this line when you have fully implemented this method. 00122 PLERROR("TestInTrainSplitter::makeDeepCopyFromShallowCopy not fully (correctly) implemented yet!"); 00123 } 00124 00126 // nsplits // 00128 int TestInTrainSplitter::nsplits() const 00129 { 00130 if (first_source_split.isEmpty()) { 00131 getFirstSplit(); // This is just to initialize n_train and n_test. 00132 } 00133 n_to_add = int(n_train * percentage_added + 0.5); 00134 if (n_to_add == 0) { 00135 // Do NOT add points in the train set. 00136 n_splits_per_source_split = 1; 00137 n_left = 0; 00138 } else { 00139 n_splits_per_source_split = n_test / n_to_add; 00140 n_left = n_test % n_to_add; 00141 } 00142 if (n_splits_per_source_split == 0) { 00143 PLERROR("In TestInTrainSplitter::nsplits - Asked to add more test samples than available"); 00144 } 00145 if (n_left > 0) 00146 n_splits_per_source_split++; 00147 int n_total = n_splits_per_source_split * source_splitter->nsplits(); 00148 return n_total; 00149 } 00150 00152 // nSetsPerSplit // 00154 int TestInTrainSplitter::nSetsPerSplit() const 00155 { 00156 return source_splitter->nSetsPerSplit(); 00157 } 00158 00160 // getFirstSplit // 00162 void TestInTrainSplitter::getFirstSplit() const { 00163 first_source_split = source_splitter->getSplit(0); 00164 n_train = first_source_split[0]->length(); 00165 n_test = first_source_split[1]->length(); 00166 } 00167 00169 // getSplit // 00171 TVec<VMat> TestInTrainSplitter::getSplit(int k) 00172 { 00173 TVec<VMat> source_split; 00174 if (first_source_split.isEmpty()) { 00175 getFirstSplit(); 00176 } 00177 if (k == 0) { 00178 source_split = first_source_split; 00179 } else { 00180 source_split = source_splitter->getSplit(k / n_splits_per_source_split); 00181 } 00182 int n_test_part = k % n_splits_per_source_split; 00183 int i_test_start = n_test_part * n_to_add; 00184 VMat train_set = source_split[0]; 00185 VMat test_set = source_split[1]; 00186 if (train_set->length() != n_train || test_set->length() != n_test) { 00187 PLERROR("In TestInTrainSplitter::getSplit - The train / test sizes have changed!"); 00188 } 00189 TVec<VMat> result(source_split.length()); 00190 if (n_to_add == 0) { 00191 // Do not change the split. 00192 result[0] = train_set; 00193 result[1] = test_set; 00194 } else if (n_left == 0 || n_test_part != n_splits_per_source_split - 1) { 00195 // Easy case: we add the same subset in train that the one for test. 00196 VMat added_to_train = new SubVMatrix(test_set, i_test_start, 0, n_to_add, test_set->width()); 00197 result[0] = vconcat(train_set, added_to_train); 00198 result[1] = added_to_train; 00199 } else { 00200 // We also take the beginning of the test to fill added_to_train, 00201 // so that we add the correct number of points in the training set. 00202 VMat new_test = new SubVMatrix(test_set, i_test_start, 0, n_left, test_set->width()); 00203 result[1] = new_test; 00204 VMat compl_for_train = new SubVMatrix(test_set, 0, 0, n_to_add - n_left, test_set->width()); 00205 VMat added_to_train = vconcat(new_test, compl_for_train); 00206 result[0] = vconcat(train_set, added_to_train); 00207 } 00208 for (int i = 2; i < result.length(); i++) { 00209 result[i] = source_split[i]; 00210 } 00211 return result; 00212 } 00213 00215 // setDataSet // 00217 void TestInTrainSplitter::setDataSet(VMat the_dataset) { 00218 first_source_split.resize(0); 00219 inherited::setDataSet(the_dataset); 00220 source_splitter->setDataSet(the_dataset); 00221 } 00222 00223 } // end of namespace PLearn