PLearn: CompactVMatrix.cc Source File

00001 // -*- C++ -*- 00002 00003 // PLearn (A C++ Machine Learning Library) 00004 // Copyright (C) 1998 Pascal Vincent 00005 // Copyright (C) 1999-2001 Pascal Vincent, Yoshua Bengio, Rejean Ducharme and University of Montreal 00006 // Copyright (C) 2002 Pascal Vincent, Julien Keable, Xavier Saint-Mleux 00007 // 00008 // Redistribution and use in source and binary forms, with or without 00009 // modification, are permitted provided that the following conditions are met: 00010 // 00011 // 1. Redistributions of source code must retain the above copyright 00012 // notice, this list of conditions and the following disclaimer. 00013 // 00014 // 2. Redistributions in binary form must reproduce the above copyright 00015 // notice, this list of conditions and the following disclaimer in the 00016 // documentation and/or other materials provided with the distribution. 00017 // 00018 // 3. The name of the authors may not be used to endorse or promote 00019 // products derived from this software without specific prior written 00020 // permission. 00021 // 00022 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00023 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00024 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00025 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00026 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00027 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00028 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00029 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00030 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00031 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00032 // 00033 // This file is part of the PLearn library. For more information on the PLearn 00034 // library, go to the PLearn Web site at www.plearn.org 00035 00036 00037 /* ******************************************************* 00038 * $Id: CompactVMatrix.cc,v 1.11 2004/07/21 16:30:55 chrish42 Exp $ 00039 ******************************************************* */ 00040 00041 #include "CompactVMatrix.h" 00042 #include <plearn/math/TMat_maths.h> 00043 #include <plearn/math/random.h> 00044 00045 namespace PLearn { 00046 using namespace std; 00047 00048 union short_and_twobytes 00049 { 00050 unsigned short us; 00051 unsigned char twobytes[2]; 00052 }; 00053 00056 // norman: added static "inizialization" 00057 unsigned char CompactVMatrix::n_bits_in_byte[256]; 00058 00059 void CompactVMatrix::set_n_bits_in_byte() 00060 { 00061 if (n_bits_in_byte[255]!=8) 00062 { 00063 for (int i=0;i<256;i++) 00064 { 00065 int n=0; 00066 unsigned char byte=i; 00067 for (int j=0;j<8;j++) 00068 { 00069 n += byte & 1; 00070 byte >>= 1; 00071 } 00072 n_bits_in_byte[i]=n; 00073 } 00074 } 00075 } 00076 00077 PLEARN_IMPLEMENT_OBJECT(CompactVMatrix, "ONE LINE DESCR", "NO HELP"); 00078 00079 CompactVMatrix::CompactVMatrix() 00080 : n_symbols(0), n_fixedpoint(0), n_variables(0), one_hot_encoding(true), n_symbol_values(0), 00081 fixedpoint_min(0), fixedpoint_max(0), delta(0), variables_permutation(0) 00082 { 00083 } 00084 00085 CompactVMatrix::CompactVMatrix(int the_length, int nvariables, int n_binary, 00086 int n_nonbinary_discrete, 00087 int n_fixed_point, TVec<int>& n_symbolvalues, 00088 Vec& fixed_point_min, Vec& fixed_point_max, 00089 bool onehot_encoding) 00090 : inherited(the_length,nvariables), n_bits(n_binary), 00091 n_symbols(n_nonbinary_discrete), n_fixedpoint(n_fixed_point), 00092 n_variables(nvariables), one_hot_encoding(onehot_encoding), 00093 n_symbol_values(n_symbolvalues), 00094 fixedpoint_min(fixed_point_min), fixedpoint_max(fixed_point_max), 00095 delta(n_fixed_point), variables_permutation(n_variables) 00096 { 00097 normal_width=n_bits+n_fixed_point; 00098 for (int i=0;i<n_symbols;i++) 00099 normal_width += n_symbol_values[i]; 00100 setOneHotMode(one_hot_encoding); 00101 for (int i=0;i<n_variables;i++) variables_permutation[i]=i; 00102 for (int i=0;i<n_symbols;i++) 00103 delta[i]=(fixedpoint_max[i]-fixedpoint_min[i])/USHRT_MAX; 00104 symbols_offset = (int)ceil(n_bits/8.0); 00105 fixedpoint_offset = symbols_offset + n_symbols; 00106 row_n_bytes = fixedpoint_offset + sizeof(unsigned short)*n_fixed_point; 00107 data.resize(length_ * row_n_bytes); 00108 set_n_bits_in_byte(); 00109 } 00110 00111 CompactVMatrix::CompactVMatrix(VMat m, int keep_last_variables_last, bool onehot_encoding) 00112 : inherited(m->length(),m->width()), one_hot_encoding(onehot_encoding), 00113 n_symbol_values(m->width()), variables_permutation(m->width()) 00114 { 00115 if (!m->hasStats()) 00116 { 00117 cout << "CompactVMatrix(VMat, int): VMat did not have stats. Computing them." << endl; 00118 m->computeStats(); 00119 } 00120 // determine which variables are binary discrete, multi-class discrete, or continuous 00121 n_bits = n_symbols = n_fixedpoint = 0; 00122 TVec<int> bits_position(m->width()); 00123 TVec<int> symbols_position(m->width()); 00124 TVec<int> fp_position(m->width()); 00125 fixedpoint_min.resize(m->width()); 00126 fixedpoint_max.resize(m->width()); 00127 delta.resize(m->width()); 00128 for (int i=0;i<m->width();i++) 00129 { 00130 VMFieldStat& stat = m->fieldstats[i]; 00131 int n_values = (int)stat.counts.size(); // 0 means "continuous" 00132 bool counts_look_continuous = !isMapKeysAreInt(stat.counts); 00133 if (n_values == 0 || counts_look_continuous || i>=m->width()-keep_last_variables_last) 00134 { 00135 fixedpoint_min[n_fixedpoint]=stat.min(); 00136 fixedpoint_max[n_fixedpoint]=stat.max(); 00137 delta[n_fixedpoint]=(stat.max()-stat.min())/USHRT_MAX; 00138 fp_position[n_fixedpoint++]=i; 00139 } 00140 else 00141 { 00142 if (stat.min()!=0 || (stat.max()-stat.min()+1)!=stat.counts.size()) 00143 PLERROR("CompactVMatrix:: variable %d looks discrete but has zero-frequency intermediate values or min!=0",i); 00144 if (n_values==2) 00145 bits_position[n_bits++]=i; 00146 else if (n_values<=256) 00147 { 00148 symbols_position[n_symbols]=i; 00149 n_symbol_values[n_symbols++] = n_values; 00150 } 00151 else 00152 { 00153 fixedpoint_min[n_fixedpoint]=stat.min(); 00154 fixedpoint_max[n_fixedpoint]=stat.max(); 00155 delta[n_fixedpoint]=(stat.max()-stat.min())/USHRT_MAX; 00156 fp_position[n_fixedpoint++]=i; 00157 } 00158 } 00159 } 00160 fixedpoint_min.resize(n_fixedpoint); 00161 fixedpoint_max.resize(n_fixedpoint); 00162 delta.resize(n_fixedpoint); 00163 n_symbol_values.resize(n_symbols); 00164 n_variables = n_bits + n_symbols + n_fixedpoint; 00165 int j=0; 00166 for (int i=0;i<n_bits;i++,j++) 00167 variables_permutation[j]=bits_position[i]; 00168 for (int i=0;i<n_symbols;i++,j++) 00169 variables_permutation[j]=symbols_position[i]; 00170 for (int i=0;i<n_fixedpoint;i++,j++) 00171 variables_permutation[j]=fp_position[i]; 00172 00173 normal_width=n_bits+n_fixedpoint; 00174 for (int i=0;i<n_symbols;i++) 00175 normal_width += n_symbol_values[i]; 00176 setOneHotMode(one_hot_encoding); 00177 symbols_offset = (int)ceil(n_bits/8.0); 00178 fixedpoint_offset = symbols_offset + n_symbols; 00179 row_n_bytes = fixedpoint_offset + sizeof(unsigned short)*n_fixedpoint; 00180 data.resize(length_ * row_n_bytes); 00181 00182 // copy the field infos and stats? not really useful with one-hot encoding 00183 // because of non-binary symbols being spread across many columns. 00184 if (!one_hot_encoding) 00185 { 00186 fieldinfos.resize(width_); 00187 fieldstats.resize(width_); 00188 for (int i=0;i<width_;i++) 00189 { 00190 fieldinfos[i]=m->getFieldInfos()[variables_permutation[i]]; 00191 fieldstats[i]=m->fieldstats[variables_permutation[i]]; 00192 } 00193 } 00194 else 00195 { 00196 fieldinfos.resize(0); 00197 fieldstats.resize(0); 00198 } 00199 00200 // copy the data 00201 Vec mrow(m->width()); 00202 for (int t=0;t<length_;t++) 00203 { 00204 m->getRow(t,mrow); 00205 encodeAndPutRow(t,mrow); 00206 } 00207 set_n_bits_in_byte(); 00208 } 00209 00210 00211 CompactVMatrix::CompactVMatrix(const string& filename, int nlast) : RowBufferedVMatrix(0,0) 00212 { 00213 load(filename); 00214 n_last=nlast; 00215 set_n_bits_in_byte(); 00216 } 00217 00218 CompactVMatrix::CompactVMatrix(CompactVMatrix* cvm, VMat m, bool rescale, bool check) 00219 : inherited(m->length(),m->width()) 00220 { 00221 if (cvm->width() != m->width()) 00222 PLERROR("CompactVMatrix::CompactVMatrix(CompactVMatrix* cvm, VMat m,...), args have widths %d!=%d", 00223 cvm->width(),m->width()); 00224 // copy all the ordinary fields 00225 row_n_bytes = cvm->row_n_bytes; 00226 data.resize(length_*row_n_bytes); 00227 n_bits = cvm->n_bits; 00228 n_symbols = cvm->n_symbols; 00229 n_fixedpoint = cvm->n_fixedpoint; 00230 n_variables = cvm->n_variables; 00231 n_symbol_values = cvm->n_symbol_values; 00232 fixedpoint_min = cvm->fixedpoint_min.copy(); 00233 fixedpoint_max = cvm->fixedpoint_max.copy(); 00234 delta = cvm->delta.copy(); 00235 variables_permutation = cvm->variables_permutation; 00236 n_last = cvm->n_last; 00237 normal_width = cvm->normal_width; 00238 symbols_offset = cvm->symbols_offset; 00239 fixedpoint_offset = cvm->fixedpoint_offset; 00240 00241 setOneHotMode(cvm->one_hot_encoding); 00242 Vec row(width_); 00243 int offs=width_-n_fixedpoint; 00244 if (rescale) 00245 { 00246 for (int i=0;i<length_;i++) 00247 { 00248 m->getRow(i,row); 00249 for (int j=0;j<n_fixedpoint;j++) 00250 { 00251 real element=row[offs+j]; 00252 if (element<fixedpoint_min[j]) 00253 fixedpoint_min[j]=element; 00254 if (element>fixedpoint_max[j]) 00255 fixedpoint_max[j]=element; 00256 } 00257 } 00258 for (int j=0;j<n_fixedpoint;j++) 00259 delta[j]=(fixedpoint_max[j]-fixedpoint_min[j])/USHRT_MAX; 00260 } 00261 for (int i=0;i<length_;i++) 00262 { 00263 m->getRow(i,row); 00264 if (!rescale && check) // check that range is OK 00265 { 00266 for (int j=0;j<n_fixedpoint;j++) 00267 { 00268 real element=row[offs+j]; 00269 if (element<fixedpoint_min[j] || 00270 element>fixedpoint_max[j]) 00271 PLERROR("CompactVMatrix::CompactVMatrix(CompactVMatrix* cvm, VMat m,...) out-of-range element(%d,%d)=%g not in [%g,%g]", 00272 i,j,element,fixedpoint_min[j],fixedpoint_max[j]); 00273 } 00274 } 00275 putRow(i,row); 00276 } 00277 } 00278 00279 void CompactVMatrix::setOneHotMode(bool on) 00280 { 00281 one_hot_encoding=on; 00282 if (one_hot_encoding) 00283 width_ = normal_width; 00284 else 00285 width_ = n_variables; 00286 } 00287 00288 void CompactVMatrix::getNewRow(int i, const Vec& v) const 00289 { 00290 #ifdef BOUNDCHECK 00291 if (i<0 || i>=length_) 00292 PLERROR("CompactVMatrix::getNewRow, row %d out of bounds [0,%d]",i,length_-1); 00293 if (v.length()!=width_) 00294 PLERROR("CompactVMatrix::getNewRow, length of v (%d) should be equal to width of VMat (%d)",v.length(),width()); 00295 #endif 00296 unsigned char* encoded_row = &data.data[i*row_n_bytes]; 00297 real* vp=v.data(); 00298 int c=0; 00299 for (int b=0;b<symbols_offset;b++) 00300 { 00301 unsigned char byte=encoded_row[b]; 00302 for (int j=0;j<8 && c<n_bits;j++,c++) 00303 { 00304 int bit = byte & 1; 00305 byte >>= 1; // shift right once 00306 vp[c]=bit; 00307 } 00308 } 00309 for (int b=0;b<n_symbols;b++) 00310 { 00311 int byte = encoded_row[symbols_offset+b]; 00312 if (one_hot_encoding) 00313 { 00314 int n=n_symbol_values[b]; 00315 for (int j=0;j<n;j++) vp[c+j]=0; 00316 vp[c+byte]=1; 00317 c+=n; 00318 } 00319 else vp[c++]=byte; 00320 } 00321 unsigned char* fixed_point_numbers = &encoded_row[fixedpoint_offset]; 00322 for (int j=0;j<n_fixedpoint;j++,c++) 00323 { 00324 unsigned char *uc = &fixed_point_numbers[2*j]; 00325 short_and_twobytes u; 00326 u.twobytes[0]=uc[0]; 00327 u.twobytes[1]=uc[1]; 00328 real decoded = u.us*delta[j]+fixedpoint_min[j]; 00329 // correct rounding errors for integers, due to fixed-point low precision 00330 real rounded_decoded = rint(decoded); 00331 if (fabs(rounded_decoded-decoded)<1e-4) 00332 decoded = rounded_decoded; 00333 vp[c]=decoded; 00334 } 00335 } 00336 00337 //#define SANITYCHECK_CompactVMatrix 00338 #define SANITYCHECK_CompactVMatrix_PRECISION 1e-5 00339 00340 real CompactVMatrix::dot(int i, int j, int inputsize) const 00341 { 00342 if(inputsize!=width()-n_last) 00343 PLERROR("In CompactVMatrix::dot, in current implementation inputsize must be equal to width()-n_last"); 00344 00345 unsigned char* encoded_row_i = &data.data[i*row_n_bytes]; 00346 unsigned char* encoded_row_j = &data.data[j*row_n_bytes]; 00347 real dot_product=0.; 00348 int c=0; 00349 for (int b=0;b<symbols_offset;b++) 00350 { 00351 unsigned char byte_i=encoded_row_i[b]; 00352 unsigned char byte_j=encoded_row_j[b]; 00353 unsigned char byte_and = byte_i & byte_j; 00354 #ifdef SANITYCHECK_CompactVMatrix 00355 real check=dot_product; 00356 #endif 00357 // Here we want to count the number of ON bits in the byte_and 00358 // instead of looping through the bits, we look-up in a 00359 // pre-computed table (n_bits_in_byte), which has been set-up by set_n_bits_in_byte(). 00360 dot_product += n_bits_in_byte[byte_and]; 00361 #ifdef SANITYCHECK_CompactVMatrix 00362 for (int j=0;j<8 && c<n_bits;j++,c++) 00363 { 00364 check += byte_and & 1; 00365 byte_and >>= 1; // shift right once 00366 } 00367 if (check!=dot_product) 00368 PLERROR("logic error in n_bits_in_byte"); 00369 #else 00370 c+=8; 00371 if (c>n_bits) c=n_bits; 00372 #endif 00373 } 00374 if (c>width_-n_last) 00375 PLERROR("CompactVMatrix: n_last should be among discrete non-binary or continuous variables"); 00376 for (int b=0;b<n_symbols && c<width_-n_last;b++) 00377 { 00378 int byte_i = encoded_row_i[symbols_offset+b]; 00379 int byte_j = encoded_row_j[symbols_offset+b]; 00380 if (byte_i==byte_j) dot_product++; 00381 if (one_hot_encoding) 00382 c+=n_symbol_values[b]; 00383 else 00384 c++; 00385 } 00386 unsigned char* fixed_point_numbers_i = &encoded_row_i[fixedpoint_offset]; 00387 unsigned char* fixed_point_numbers_j = &encoded_row_j[fixedpoint_offset]; 00388 for (int k=0;k<n_fixedpoint-n_last && c<width_-n_last;k++,c++) 00389 { 00390 unsigned char *uc = &fixed_point_numbers_i[2*k]; 00391 short_and_twobytes u; 00392 u.twobytes[0]=uc[0]; 00393 u.twobytes[1]=uc[1]; 00394 real decoded_i = u.us*delta[k]+fixedpoint_min[k]; 00395 uc = &fixed_point_numbers_j[2*k]; 00396 u.twobytes[0]=uc[0]; 00397 u.twobytes[1]=uc[1]; 00398 real decoded_j = u.us*delta[k]+fixedpoint_min[k]; 00399 #ifdef SANITYCHECK_CompactVMatrix 00400 real rounded_decoded_i = rint(decoded_i); 00401 if (fabs(rounded_decoded_i-decoded_i)<1e-4) 00402 decoded_i = rounded_decoded_i; 00403 real rounded_decoded_j = rint(decoded_j); 00404 if (fabs(rounded_decoded_j-decoded_j)<1e-4) 00405 decoded_j = rounded_decoded_j; 00406 #endif 00407 dot_product += decoded_i * decoded_j; 00408 } 00409 00410 return dot_product; 00411 } 00412 00413 // I used the code for getRow as a basis to implement this call (Pascal) 00414 real CompactVMatrix::dot(int i, const Vec& v) const 00415 { 00416 #ifdef BOUNDCHECK 00417 if (i<0 || i>=length_) 00418 PLERROR("CompactVMatrix::dot, row %d out of bounds [0,%d]",i,length_-1); 00419 #endif 00420 00421 if(v.length()!=width()-n_last) 00422 PLERROR("In CompactVMatrix::dot, in current implementation v.length() must be equal to width()-n_last"); 00423 00424 real dot_product = 0.; 00425 00426 unsigned char* encoded_row = &data.data[i*row_n_bytes]; 00427 real* vp=v.data(); 00428 int c=0; 00429 for (int b=0;b<symbols_offset;b++) 00430 { 00431 unsigned char byte=encoded_row[b]; 00432 for (int j=0;j<8 && c<n_bits;j++,c++) 00433 { 00434 int bit = byte & 1; 00435 byte >>= 1; // shift right once 00436 if(bit) 00437 dot_product += vp[c]; 00438 } 00439 } 00440 for (int b=0;b<n_symbols;b++) 00441 { 00442 int byte = encoded_row[symbols_offset+b]; 00443 if (one_hot_encoding) 00444 { 00445 int n=n_symbol_values[b]; 00446 dot_product += vp[c+byte]; 00447 c += n; 00448 } 00449 else 00450 dot_product += vp[c++]*byte; 00451 } 00452 // WARNING: COULD THIS CAUSE PROBLEMS IF fixedpoint_offset IS NOT A MULTIPLE OF 4 00453 // ON SOME MACHINES? 00454 unsigned char* fixed_point_numbers = &encoded_row[fixedpoint_offset]; 00455 for (int j=0;j<n_fixedpoint-n_last && c<v.length();j++,c++) 00456 { 00457 unsigned char *uc = &fixed_point_numbers[2*j]; 00458 short_and_twobytes u; 00459 u.twobytes[0]=uc[0]; 00460 u.twobytes[1]=uc[1]; 00461 real decoded = u.us*delta[j]+fixedpoint_min[j]; 00462 // correct rounding errors for integers, due to fixed-point low precision 00463 real rounded_decoded = rint(decoded); 00464 if (fabs(rounded_decoded-decoded)<1e-4) 00465 decoded = rounded_decoded; 00466 dot_product += vp[c] * decoded; 00467 } 00468 00469 // Very Slow SANITY CHECK 00470 #ifdef SANITYCHECK_CompactVMatrix 00471 Vec v_i(v.length()); 00472 getRow(i,v_i); 00473 real dot_product2 = PLearn::dot(v_i.subVec(0,v.length()),v); 00474 real diff = fabs(dot_product-dot_product2)/fabs(dot_product2); 00475 if(diff>SANITYCHECK_CompactVMatrix_PRECISION) 00476 PLERROR("IN CompactVMatrix::dot(int i=%d, v) SANITY CHECK FAILED: difference=%g",i,diff); 00477 #endif 00478 00479 return dot_product; 00480 } 00481 00482 00483 real CompactVMatrix::dotProduct(int i, int j) const 00484 { return dot(i,j,width()-n_last); } 00485 00486 real CompactVMatrix::squareDifference(int i, int j) 00487 { 00488 if (row_norms.length()==0) 00489 row_norms = Vec(length_,-1.0); 00490 real normi = row_norms[i]; 00491 if (normi<0) normi=row_norms[i]=dotProduct(i,i); 00492 real normj = row_norms[j]; 00493 if (normj<0) normj=row_norms[j]=dotProduct(j,j); 00494 return normi + normj - 2 * dotProduct(i,j); 00495 } 00496 00497 void CompactVMatrix::encodeAndPutRow(int i, Vec v) 00498 { 00499 unsigned char* encoded_row = &data.data[i*row_n_bytes]; 00500 real* vp=v.data(); 00501 int* perm=variables_permutation.data(); 00502 int c=0; 00503 // 1 vector element ==> 1 bit 00504 for (int b=0;b<symbols_offset;b++) 00505 { 00506 unsigned char byte=0; 00507 for (int j=0;j<8 && c<n_bits;j++,c++) 00508 byte |= int(vp[perm[c]]) << j; // shift to right bit position 00509 encoded_row[b]=byte; 00510 } 00511 // 1 vector element (integer between 0 and n-1) ==> 1 byte 00512 for (int b=0;b<n_symbols;b++,c++) 00513 { 00514 real val = vp[perm[c]]; 00515 int s = int(val); 00516 if (s!=val) 00517 PLERROR("CompactVMatrix::encodeAndPutRow(%d,v): v[%d]=%g not an integer", 00518 i,int(perm[c]),val); 00519 encoded_row[symbols_offset+b] = s; // ASSUMES THAT v IS NOT ONE-HOT ENCODED 00520 if (s<0 || s>=n_symbol_values[b]) 00521 PLERROR("CompactVMatrix::encodeAndPutRow(%d,v): v[%d]=%d not in expected range (0,%d)", 00522 i,int(perm[c]),s,n_symbol_values[b]-1); 00523 } 00524 // WARNING: COULD THIS CAUSE PROBLEMS IF fixedpoint_offset IS NOT A MULTIPLE OF 4 00525 // ON SOME MACHINES? 00526 unsigned short* fixed_point_numbers = (unsigned short*)&encoded_row[fixedpoint_offset]; 00527 for (int j=0;j<n_fixedpoint;j++,c++) 00528 fixed_point_numbers[j]=(unsigned short)((vp[perm[c]]-fixedpoint_min[j])/delta[j]); 00529 } 00530 00531 void CompactVMatrix::putRow(int i, Vec v) 00532 { 00533 putSubRow(i,0,v); 00534 } 00535 00536 void CompactVMatrix::putSubRow(int i, int j, Vec v) 00537 { 00538 unsigned char* encoded_row = &data.data[i*row_n_bytes]; 00539 real* vp=v.data(); 00540 int c=0; 00541 // 1 vector element ==> 1 bit 00542 for (int b=0;b<symbols_offset;b++) 00543 { 00544 unsigned char byte=0; 00545 for (int k=0;k<8 && c<n_bits;k++,c++) 00546 if (c>=j) 00547 byte |= int(vp[c-j]) << k; // shift to right bit position 00548 encoded_row[b]=byte; 00549 } 00550 // if (one_hot_encoding) 00551 // n vector elements in one-hot-code ==> 1 byte 00552 // else 00553 // 1 vector element (integer between 0 and n-1) ==> 1 byte 00554 int n=0; 00555 if (one_hot_encoding) 00556 for (int b=0;b<n_symbols;b++,c+=n) 00557 { 00558 n=n_symbol_values[b]; 00559 if (c>=j) 00560 { 00561 int pos=-1; 00562 for (int k=0;k<n;k++) 00563 { 00564 real vk=vp[c+k-j]; 00565 if (vk!=0 && vk!=1) 00566 PLERROR("CompactVMatrix::putRow(%d,v): v[%d]=%g!=0 or 1 (not one-hot-code)", 00567 i,c,vk); 00568 if (vk==1) 00569 { 00570 if (pos<0) pos=k; 00571 else PLERROR("CompactVMatrix::putRow(%d,v): %d-th symbol not one-hot-encoded", 00572 i,b); 00573 } 00574 } 00575 if (pos<0) 00576 PLERROR("CompactVMatrix::putRow(%d,v): %d-th symbol not one-hot-encoded", 00577 i,b); 00578 encoded_row[symbols_offset+b] = pos; 00579 } 00580 } 00581 else 00582 for (int b=0;b<n_symbols;b++,c++) 00583 if (c>=j) 00584 { 00585 real val = vp[c-j]; 00586 int s = int(val); 00587 if (s!=val) 00588 PLERROR("CompactVMatrix::encodeAndPutRow(%d,v): v[%d]=%g not an integer", 00589 i,c,val); 00590 encoded_row[symbols_offset+b] = s; // ASSUMES THAT v IS NOT ONE-HOT ENCODED 00591 if (s<0 || s>=n_symbol_values[b]) 00592 PLERROR("CompactVMatrix::encodeAndPutRow(%d,v): v[%d]=%d not in expected range (0,%d)", 00593 i,c,s,n_symbol_values[b]-1); 00594 } 00595 00596 // 1 vector element (real betweeen fixedpoint_min and fixedpoint_max) ==> 2 bytes 00597 // 00598 // WARNING: COULD THIS CAUSE PROBLEMS IF fixedpoint_offset IS NOT A MULTIPLE OF 4 00599 // ON SOME MACHINES? 00600 unsigned short* fixed_point_numbers = (unsigned short*)&encoded_row[fixedpoint_offset]; 00601 for (int k=0;k<n_fixedpoint;k++,c++) 00602 if (c>=j) 00603 fixed_point_numbers[k]=(unsigned short)((vp[c-j]-fixedpoint_min[k])/delta[k]); 00604 } 00605 00606 void CompactVMatrix::perturb(int i, Vec v, real noise_level, int n_last) 00607 { 00608 #ifdef BOUNDCHECK 00609 if (i<0 || i>=length_) 00610 PLERROR("CompactVMatrix::perturb, row %d out of bounds [0,%d]",i,length_-1); 00611 if (v.length()!=width_) 00612 PLERROR("CompactVMatrix::perturb, length of v (%d) should be equal to width of VMat (%d)",v.length(),width()); 00613 #endif 00614 if (fieldstats.size()!=n_variables) 00615 PLERROR("CompactVMatrix::perturb: stats not computed or wrong size"); 00616 if (noise_level<0 || noise_level>1) 00617 PLERROR("CompactVMatrix::perturb: noise_level=%g, should be in [0,1]",noise_level); 00618 00619 unsigned char* encoded_row = &data.data[i*row_n_bytes]; 00620 real* vp=v.data(); 00621 int c=0; 00622 int var=0; 00623 Vec probs(width_); 00624 for (int b=0;b<symbols_offset;b++) 00625 { 00626 unsigned char byte=encoded_row[b]; 00627 for (int j=0;j<8 && c<n_bits;j++,c++,var++) 00628 { 00629 int bit = byte & 1; 00630 byte >>= 1; // shift right once 00631 vp[c]=binomial_sample((1-noise_level)*bit+noise_level*fieldstats[var].prob(1)); 00632 } 00633 } 00634 for (int b=0;b<n_symbols;b++,var++) 00635 { 00636 int byte = encoded_row[symbols_offset+b]; 00637 int nv=n_symbol_values[b]; 00638 probs.resize(nv); 00639 VMFieldStat& stat=fieldstats[var]; 00640 for (int val=0;val<nv;val++) 00641 if (val==byte) 00642 probs[val]=(1-noise_level)+noise_level*stat.prob(val); 00643 else 00644 probs[val]=noise_level*stat.prob(val); 00645 byte = multinomial_sample(probs); 00646 if (one_hot_encoding) 00647 { 00648 int n=n_symbol_values[b]; 00649 for (int j=0;j<n;j++) vp[c+j]=0; 00650 vp[c+byte]=1; 00651 c+=n; 00652 } 00653 else vp[c++]=byte; 00654 } 00655 unsigned char* fixed_point_numbers = &encoded_row[fixedpoint_offset]; 00656 for (int j=0;j<n_fixedpoint;j++,c++,var++) 00657 { 00658 unsigned char *uc = &fixed_point_numbers[2*j]; 00659 short_and_twobytes u; 00660 u.twobytes[0]=uc[0]; 00661 u.twobytes[1]=uc[1]; 00662 real decoded = u.us*delta[j]+fixedpoint_min[j]; 00663 // correct rounding errors for integers, due to fixed-point low precision 00664 real rounded_decoded = rint(decoded); 00665 if (fabs(rounded_decoded-decoded)<1e-4) 00666 decoded = rounded_decoded; 00667 if (var<n_variables-n_last) 00668 { 00669 int ntry=0; 00670 do 00671 { 00672 vp[c]=decoded+noise_level*fieldstats[var].stddev()*normal_sample(); 00673 ntry++; 00674 if (ntry>=100) 00675 PLERROR("CompactVMatrix::perturb:Something wrong in resampling, tried 100 times"); 00676 } 00677 while (vp[c]<fixedpoint_min[j] || vp[c]>fixedpoint_max[j]); 00678 } 00679 else 00680 vp[c]=decoded; 00681 } 00682 } 00683 /* 00684 void CompactVMatrix::write(ostream& out) const 00685 { 00686 writeHeader(out,"CompactVMatrix"); 00687 writeField(out,"length",length_); 00688 writeField(out,"width",normal_width); 00689 writeField(out,"fieldinfos",fieldinfos); 00690 writeField(out,"fieldstats",fieldstats); 00691 writeField(out,"row_n_bytes",row_n_bytes); 00692 writeField(out,"n_bits",n_bits); 00693 writeField(out,"n_symbols",n_symbols); 00694 writeField(out,"n_fixedpoint",n_fixedpoint); 00695 writeField(out,"one_hot_encoding",one_hot_encoding); 00696 writeField(out,"n_symbol_values",n_symbol_values); 00697 writeField(out,"fixedpoint_min",fixedpoint_min); 00698 writeField(out,"fixedpoint_max",fixedpoint_max); 00699 writeField(out,"delta",delta); 00700 writeField(out,"variables_permutation",variables_permutation); 00701 writeField(out,"symbols_offset",symbols_offset); 00702 writeField(out,"fixedpoint_offset",fixedpoint_offset); 00703 out.write((char*)data.data,data.length()*sizeof(unsigned char)); 00704 writeFooter(out,"CompactVMatrix"); 00705 } 00706 00707 void CompactVMatrix::oldread(istream& in) 00708 { 00709 readHeader(in,"CompactVMatrix"); 00710 readField(in,"length",length_); 00711 readField(in,"width",normal_width); 00712 readField(in,"fieldinfos",fieldinfos); 00713 fieldinfos.resize(0); // to fix current bug in setting fieldinfos 00714 readField(in,"fieldstats",fieldstats); 00715 readField(in,"row_n_bytes",row_n_bytes); 00716 readField(in,"n_bits",n_bits); 00717 readField(in,"n_symbols",n_symbols); 00718 readField(in,"n_fixedpoint",n_fixedpoint); 00719 n_variables = n_bits + n_symbols + n_fixedpoint; 00720 readField(in,"one_hot_encoding",one_hot_encoding); 00721 setOneHotMode(one_hot_encoding); 00722 readField(in,"n_symbol_values",n_symbol_values); 00723 readField(in,"fixedpoint_min",fixedpoint_min); 00724 readField(in,"fixedpoint_max",fixedpoint_max); 00725 readField(in,"delta",delta); 00726 readField(in,"variables_permutation",variables_permutation); 00727 readField(in,"symbols_offset",symbols_offset); 00728 readField(in,"fixedpoint_offset",fixedpoint_offset); 00729 data.resize(row_n_bytes*length_); 00730 in.read((char*)data.data,data.length()*sizeof(unsigned char)); 00731 readFooter(in,"CompactVMatrix"); 00732 } 00733 */ 00734 void CompactVMatrix::append(CompactVMatrix* vm) 00735 { 00736 if (width_!=vm->width()) 00737 PLERROR("CompactVMatrix::append, incompatible width %d vs %d", 00738 width_,vm->width()); 00739 if (row_n_bytes!=vm->row_n_bytes) 00740 PLERROR("CompactVMatrix::append, incompatible row_n_bytes %d vs %d", 00741 row_n_bytes,vm->row_n_bytes); 00742 if (n_bits!=vm->n_bits) 00743 PLERROR("CompactVMatrix::append, incompatible n_bits %d vs %d", 00744 n_bits,vm->n_bits); 00745 if (n_symbols!=vm->n_symbols) 00746 PLERROR("CompactVMatrix::append, incompatible n_symbols %d vs %d", 00747 n_symbols,vm->n_symbols); 00748 if (n_fixedpoint!=vm->n_fixedpoint) 00749 PLERROR("CompactVMatrix::append, incompatible n_fixedpoint %d vs %d", 00750 n_fixedpoint,vm->n_fixedpoint); 00751 if (n_symbol_values!=vm->n_symbol_values) 00752 { 00753 //n_symbol_values.write(cerr); cerr << endl; 00754 //vm->n_symbol_values.write(cerr); cerr << endl; 00755 PLearn::write(cerr, n_symbol_values); 00756 cerr << endl; 00757 PLearn::write(cerr, vm->n_symbol_values); 00758 cerr << endl; 00759 PLERROR("CompactVMatrix::append, incompatible n_symbol_values"); 00760 } 00761 bool rescale = false; 00762 for (int j=0;j<n_fixedpoint && !rescale;j++) 00763 if (fixedpoint_min[j]>vm->fixedpoint_min[j] || 00764 fixedpoint_max[j]<vm->fixedpoint_max[j]) rescale=true; 00765 if (rescale) 00766 { 00767 cout << "The appended VMat has intervals that are wider than the current one." << endl; 00768 cout << "Start rescaling numeric variables fixed point representation." << endl; 00769 Vec new_min = fixedpoint_min.copy(); 00770 Vec new_max = fixedpoint_max.copy(); 00771 Vec new_delta = delta.copy(); 00772 TVec<bool> change(n_fixedpoint); 00773 for (int j=0;j<n_fixedpoint;j++) 00774 { 00775 change[j]=false; 00776 if (fixedpoint_min[j]>vm->fixedpoint_min[j]) 00777 { 00778 change[j]=true; 00779 new_min[j]=vm->fixedpoint_min[j]; 00780 } 00781 if (fixedpoint_max[j]<vm->fixedpoint_max[j]) 00782 { 00783 change[j]=true; 00784 new_max[j]=vm->fixedpoint_max[j]; 00785 } 00786 if (change[j]) 00787 new_delta[j]=(new_max[j]-new_min[j])/USHRT_MAX; 00788 } 00789 for (int r=0;r<length_;r++) 00790 { 00791 unsigned char* encoded_row = &data.data[r*row_n_bytes]; 00792 unsigned char* fixed_point_numbers = &encoded_row[fixedpoint_offset]; 00793 for (int j=0;j<n_fixedpoint;j++) 00794 if (change[j]) 00795 { 00796 // DECODE using previous min/max 00797 unsigned char *uc = &fixed_point_numbers[2*j]; 00798 short_and_twobytes u; 00799 u.twobytes[0]=uc[0]; 00800 u.twobytes[1]=uc[1]; 00801 real decoded = u.us*delta[j]+fixedpoint_min[j]; 00802 // correct rounding errors for integers, due to fixed-point low precision 00803 real rounded_decoded = rint(decoded); 00804 if (fabs(rounded_decoded-decoded)<1e-4) 00805 decoded = rounded_decoded; 00806 // ENCODE using new min/max 00807 fixed_point_numbers[j]=(unsigned char)((decoded-new_min[j])/new_delta[j]); 00808 } 00809 } 00810 cout << "DONE rescaling numeric variables fixed point representation." << endl; 00811 fixedpoint_min << new_min; 00812 fixedpoint_max << new_max; 00813 delta << new_delta; 00814 } 00815 int new_length=length_+vm->length(); 00816 data.resize(row_n_bytes*new_length); 00817 // copy the new data 00818 Vec row(width_); 00819 bool old_vm_encoding = vm->one_hot_encoding; 00820 bool old_encoding = one_hot_encoding; 00821 vm->one_hot_encoding=false; 00822 setOneHotMode(false); 00823 int old_length=length_; 00824 length_=new_length; 00825 for (int r=0;r<vm->length();r++) 00826 { 00827 vm->getRow(r,row); 00828 putRow(old_length+r,row); 00829 } 00830 vm->one_hot_encoding=old_vm_encoding; 00831 setOneHotMode(old_encoding); 00832 } 00833 00834 void CompactVMatrix::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies) 00835 { 00836 deepCopyField(data, copies); 00837 deepCopyField(n_symbol_values, copies); 00838 deepCopyField(fixedpoint_min, copies); 00839 deepCopyField(fixedpoint_max, copies); 00840 deepCopyField(variables_permutation, copies); 00841 } 00842 00843 } // end of namespace PLearn 00844