00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
#include "CompactVMatrix.h"
00042
#include <plearn/math/TMat_maths.h>
00043
#include <plearn/math/random.h>
00044
00045
namespace PLearn {
00046
using namespace std;
00047
// Overlay of one unsigned short and its two constituent bytes.
// The decoding routines of this file copy two stored bytes into `twobytes`
// and then read the 16-bit fixed-point value back through `us`, so values
// are kept in the byte order of the machine that wrote them.
union short_and_twobytes
{
    unsigned short us;          // the value seen as one 16-bit unsigned integer
    unsigned char twobytes[2];  // the same storage seen as two raw bytes
};
00053
00056
00057 unsigned char CompactVMatrix::n_bits_in_byte[256];
00058
00059 void CompactVMatrix::set_n_bits_in_byte()
00060 {
00061
if (
n_bits_in_byte[255]!=8)
00062 {
00063
for (
int i=0;i<256;i++)
00064 {
00065
int n=0;
00066
unsigned char byte=i;
00067
for (
int j=0;j<8;j++)
00068 {
00069 n += byte & 1;
00070 byte >>= 1;
00071 }
00072
n_bits_in_byte[i]=n;
00073 }
00074 }
00075 }
00076
00077
// Invocation of the PLEARN_IMPLEMENT_OBJECT macro for CompactVMatrix.
// NOTE(review): both string arguments are still placeholders ("ONE LINE
// DESCR" / "NO HELP"); presumably they are the one-line description and the
// help text of the class -- confirm against the macro definition before
// replacing them with real documentation.
PLEARN_IMPLEMENT_OBJECT(
CompactVMatrix,
"ONE LINE DESCR",
"NO HELP");
00078
00079 CompactVMatrix::CompactVMatrix()
00080 : n_symbols(0), n_fixedpoint(0), n_variables(0), one_hot_encoding(true), n_symbol_values(0),
00081 fixedpoint_min(0), fixedpoint_max(0), delta(0), variables_permutation(0)
00082 {
00083 }
00084
00085 CompactVMatrix::CompactVMatrix(
int the_length,
int nvariables,
int n_binary,
00086
int n_nonbinary_discrete,
00087
int n_fixed_point,
TVec<int>& n_symbolvalues,
00088
Vec& fixed_point_min,
Vec& fixed_point_max,
00089
bool onehot_encoding)
00090 :
inherited(the_length,nvariables), n_bits(n_binary),
00091 n_symbols(n_nonbinary_discrete), n_fixedpoint(n_fixed_point),
00092 n_variables(nvariables), one_hot_encoding(onehot_encoding),
00093 n_symbol_values(n_symbolvalues),
00094 fixedpoint_min(fixed_point_min), fixedpoint_max(fixed_point_max),
00095 delta(n_fixed_point), variables_permutation(n_variables)
00096 {
00097
normal_width=
n_bits+n_fixed_point;
00098
for (
int i=0;i<
n_symbols;i++)
00099
normal_width +=
n_symbol_values[i];
00100
setOneHotMode(
one_hot_encoding);
00101
for (
int i=0;i<
n_variables;i++)
variables_permutation[i]=i;
00102
for (
int i=0;i<n_symbols;i++)
00103
delta[i]=(
fixedpoint_max[i]-
fixedpoint_min[i])/USHRT_MAX;
00104
symbols_offset = (
int)ceil(
n_bits/8.0);
00105
fixedpoint_offset =
symbols_offset + n_symbols;
00106
row_n_bytes =
fixedpoint_offset +
sizeof(
unsigned short)*n_fixed_point;
00107
data.
resize(length_ *
row_n_bytes);
00108
set_n_bits_in_byte();
00109 }
00110
00111 CompactVMatrix::CompactVMatrix(
VMat m,
int keep_last_variables_last,
bool onehot_encoding)
00112 :
inherited(m->length(),m->width()), one_hot_encoding(onehot_encoding),
00113 n_symbol_values(m->width()), variables_permutation(m->width())
00114 {
00115
if (!m->hasStats())
00116 {
00117 cout <<
"CompactVMatrix(VMat, int): VMat did not have stats. Computing them." <<
endl;
00118 m->computeStats();
00119 }
00120
00121
n_bits =
n_symbols =
n_fixedpoint = 0;
00122
TVec<int> bits_position(m->
width());
00123
TVec<int> symbols_position(m->
width());
00124
TVec<int> fp_position(m->
width());
00125
fixedpoint_min.
resize(m->
width());
00126
fixedpoint_max.
resize(m->
width());
00127
delta.
resize(m->
width());
00128
for (
int i=0;i<m->
width();i++)
00129 {
00130
VMFieldStat& stat = m->fieldstats[i];
00131
int n_values = (
int)stat.
counts.size();
00132
bool counts_look_continuous = !
isMapKeysAreInt(stat.
counts);
00133
if (n_values == 0 || counts_look_continuous || i>=m->
width()-keep_last_variables_last)
00134 {
00135
fixedpoint_min[n_fixedpoint]=stat.
min();
00136
fixedpoint_max[n_fixedpoint]=stat.
max();
00137
delta[n_fixedpoint]=(stat.
max()-stat.
min())/USHRT_MAX;
00138 fp_position[n_fixedpoint++]=i;
00139 }
00140
else
00141 {
00142
if (stat.
min()!=0 || (stat.
max()-stat.
min()+1)!=stat.
counts.size())
00143
PLERROR(
"CompactVMatrix:: variable %d looks discrete but has zero-frequency intermediate values or min!=0",i);
00144
if (n_values==2)
00145 bits_position[
n_bits++]=i;
00146
else if (n_values<=256)
00147 {
00148 symbols_position[n_symbols]=i;
00149
n_symbol_values[n_symbols++] = n_values;
00150 }
00151
else
00152 {
00153
fixedpoint_min[n_fixedpoint]=stat.
min();
00154
fixedpoint_max[n_fixedpoint]=stat.
max();
00155
delta[n_fixedpoint]=(stat.
max()-stat.
min())/USHRT_MAX;
00156 fp_position[n_fixedpoint++]=i;
00157 }
00158 }
00159 }
00160
fixedpoint_min.
resize(n_fixedpoint);
00161
fixedpoint_max.
resize(n_fixedpoint);
00162
delta.
resize(n_fixedpoint);
00163
n_symbol_values.
resize(n_symbols);
00164
n_variables =
n_bits + n_symbols + n_fixedpoint;
00165
int j=0;
00166
for (
int i=0;i<
n_bits;i++,j++)
00167
variables_permutation[j]=bits_position[i];
00168
for (
int i=0;i<n_symbols;i++,j++)
00169
variables_permutation[j]=symbols_position[i];
00170
for (
int i=0;i<n_fixedpoint;i++,j++)
00171
variables_permutation[j]=fp_position[i];
00172
00173
normal_width=n_bits+n_fixedpoint;
00174
for (
int i=0;i<n_symbols;i++)
00175
normal_width +=
n_symbol_values[i];
00176
setOneHotMode(
one_hot_encoding);
00177
symbols_offset = (
int)ceil(n_bits/8.0);
00178
fixedpoint_offset =
symbols_offset + n_symbols;
00179
row_n_bytes =
fixedpoint_offset +
sizeof(
unsigned short)*n_fixedpoint;
00180
data.
resize(length_ *
row_n_bytes);
00181
00182
00183
00184
if (!
one_hot_encoding)
00185 {
00186 fieldinfos.
resize(width_);
00187 fieldstats.
resize(width_);
00188
for (
int i=0;i<width_;i++)
00189 {
00190 fieldinfos[i]=m->getFieldInfos()[
variables_permutation[i]];
00191 fieldstats[i]=m->fieldstats[
variables_permutation[i]];
00192 }
00193 }
00194
else
00195 {
00196 fieldinfos.
resize(0);
00197 fieldstats.
resize(0);
00198 }
00199
00200
00201
Vec mrow(m->
width());
00202
for (
int t=0;t<length_;t++)
00203 {
00204 m->getRow(t,mrow);
00205
encodeAndPutRow(t,mrow);
00206 }
00207
set_n_bits_in_byte();
00208 }
00209
00210
00211 CompactVMatrix::CompactVMatrix(
const string& filename,
int nlast) :
RowBufferedVMatrix(0,0)
00212 {
00213
load(filename);
00214
n_last=nlast;
00215
set_n_bits_in_byte();
00216 }
00217
00218 CompactVMatrix::CompactVMatrix(
CompactVMatrix* cvm,
VMat m,
bool rescale,
bool check)
00219 :
inherited(m->length(),m->width())
00220 {
00221
if (cvm->
width() != m->
width())
00222
PLERROR(
"CompactVMatrix::CompactVMatrix(CompactVMatrix* cvm, VMat m,...), args have widths %d!=%d",
00223 cvm->
width(),m->
width());
00224
00225
row_n_bytes = cvm->
row_n_bytes;
00226
data.
resize(length_*
row_n_bytes);
00227
n_bits = cvm->
n_bits;
00228
n_symbols = cvm->
n_symbols;
00229
n_fixedpoint = cvm->
n_fixedpoint;
00230
n_variables = cvm->
n_variables;
00231
n_symbol_values = cvm->
n_symbol_values;
00232
fixedpoint_min = cvm->
fixedpoint_min.
copy();
00233
fixedpoint_max = cvm->
fixedpoint_max.
copy();
00234
delta = cvm->
delta.
copy();
00235
variables_permutation = cvm->
variables_permutation;
00236
n_last = cvm->
n_last;
00237
normal_width = cvm->
normal_width;
00238
symbols_offset = cvm->
symbols_offset;
00239
fixedpoint_offset = cvm->
fixedpoint_offset;
00240
00241
setOneHotMode(cvm->
one_hot_encoding);
00242
Vec row(width_);
00243
int offs=width_-
n_fixedpoint;
00244
if (rescale)
00245 {
00246
for (
int i=0;i<length_;i++)
00247 {
00248 m->getRow(i,row);
00249
for (
int j=0;j<n_fixedpoint;j++)
00250 {
00251
real element=row[offs+j];
00252
if (element<
fixedpoint_min[j])
00253 fixedpoint_min[j]=element;
00254
if (element>
fixedpoint_max[j])
00255 fixedpoint_max[j]=element;
00256 }
00257 }
00258
for (
int j=0;j<n_fixedpoint;j++)
00259
delta[j]=(
fixedpoint_max[j]-
fixedpoint_min[j])/USHRT_MAX;
00260 }
00261
for (
int i=0;i<length_;i++)
00262 {
00263 m->getRow(i,row);
00264
if (!rescale && check)
00265 {
00266
for (
int j=0;j<n_fixedpoint;j++)
00267 {
00268
real element=row[offs+j];
00269
if (element<
fixedpoint_min[j] ||
00270 element>
fixedpoint_max[j])
00271
PLERROR(
"CompactVMatrix::CompactVMatrix(CompactVMatrix* cvm, VMat m,...) out-of-range element(%d,%d)=%g not in [%g,%g]",
00272 i,j,element,fixedpoint_min[j],fixedpoint_max[j]);
00273 }
00274 }
00275
putRow(i,row);
00276 }
00277 }
00278
00279 void CompactVMatrix::setOneHotMode(
bool on)
00280 {
00281
one_hot_encoding=on;
00282
if (
one_hot_encoding)
00283 width_ =
normal_width;
00284
else
00285 width_ =
n_variables;
00286 }
00287
00288 void CompactVMatrix::getNewRow(
int i,
const Vec& v)
const
00289
{
00290
#ifdef BOUNDCHECK
00291
if (i<0 || i>=length_)
00292
PLERROR(
"CompactVMatrix::getNewRow, row %d out of bounds [0,%d]",i,length_-1);
00293
if (v.
length()!=width_)
00294
PLERROR(
"CompactVMatrix::getNewRow, length of v (%d) should be equal to width of VMat (%d)",v.
length(),
width());
00295
#endif
00296
unsigned char* encoded_row = &
data.
data[i*
row_n_bytes];
00297
real* vp=v.
data();
00298
int c=0;
00299
for (
int b=0;b<
symbols_offset;b++)
00300 {
00301
unsigned char byte=encoded_row[b];
00302
for (
int j=0;j<8 && c<
n_bits;j++,c++)
00303 {
00304
int bit = byte & 1;
00305 byte >>= 1;
00306 vp[c]=bit;
00307 }
00308 }
00309
for (
int b=0;b<
n_symbols;b++)
00310 {
00311
int byte = encoded_row[symbols_offset+b];
00312
if (
one_hot_encoding)
00313 {
00314
int n=
n_symbol_values[b];
00315
for (
int j=0;j<n;j++) vp[c+j]=0;
00316 vp[c+byte]=1;
00317 c+=n;
00318 }
00319
else vp[c++]=byte;
00320 }
00321
unsigned char* fixed_point_numbers = &encoded_row[
fixedpoint_offset];
00322
for (
int j=0;j<
n_fixedpoint;j++,c++)
00323 {
00324
unsigned char *uc = &fixed_point_numbers[2*j];
00325
short_and_twobytes u;
00326 u.
twobytes[0]=uc[0];
00327 u.
twobytes[1]=uc[1];
00328
real decoded = u.
us*
delta[j]+
fixedpoint_min[j];
00329
00330
real rounded_decoded = rint(decoded);
00331
if (fabs(rounded_decoded-decoded)<1e-4)
00332 decoded = rounded_decoded;
00333 vp[c]=decoded;
00334 }
00335 }
00336
00337
00338 #define SANITYCHECK_CompactVMatrix_PRECISION 1e-5
00339
00340 real CompactVMatrix::dot(
int i,
int j,
int inputsize)
const
00341
{
00342
if(inputsize!=
width()-
n_last)
00343
PLERROR(
"In CompactVMatrix::dot, in current implementation inputsize must be equal to width()-n_last");
00344
00345
unsigned char* encoded_row_i = &
data.
data[i*
row_n_bytes];
00346
unsigned char* encoded_row_j = &
data.
data[j*
row_n_bytes];
00347
real dot_product=0.;
00348
int c=0;
00349
for (
int b=0;b<
symbols_offset;b++)
00350 {
00351
unsigned char byte_i=encoded_row_i[b];
00352
unsigned char byte_j=encoded_row_j[b];
00353
unsigned char byte_and = byte_i & byte_j;
00354
#ifdef SANITYCHECK_CompactVMatrix
00355
real check=
dot_product;
00356
#endif
00357
00358
00359
00360
dot_product +=
n_bits_in_byte[byte_and];
00361
#ifdef SANITYCHECK_CompactVMatrix
00362
for (
int j=0;j<8 && c<
n_bits;j++,c++)
00363 {
00364 check += byte_and & 1;
00365 byte_and >>= 1;
00366 }
00367
if (check!=
dot_product)
00368
PLERROR(
"logic error in n_bits_in_byte");
00369
#else
00370
c+=8;
00371
if (c>n_bits) c=n_bits;
00372
#endif
00373
}
00374
if (c>width_-
n_last)
00375
PLERROR(
"CompactVMatrix: n_last should be among discrete non-binary or continuous variables");
00376
for (
int b=0;b<
n_symbols && c<width_-
n_last;b++)
00377 {
00378
int byte_i = encoded_row_i[symbols_offset+b];
00379
int byte_j = encoded_row_j[symbols_offset+b];
00380
if (byte_i==byte_j)
dot_product++;
00381
if (
one_hot_encoding)
00382 c+=
n_symbol_values[b];
00383
else
00384 c++;
00385 }
00386
unsigned char* fixed_point_numbers_i = &encoded_row_i[
fixedpoint_offset];
00387
unsigned char* fixed_point_numbers_j = &encoded_row_j[
fixedpoint_offset];
00388
for (
int k=0;
k<
n_fixedpoint-n_last && c<width_-n_last;
k++,c++)
00389 {
00390
unsigned char *uc = &fixed_point_numbers_i[2*
k];
00391
short_and_twobytes u;
00392 u.
twobytes[0]=uc[0];
00393 u.
twobytes[1]=uc[1];
00394
real decoded_i = u.
us*
delta[
k]+
fixedpoint_min[
k];
00395 uc = &fixed_point_numbers_j[2*
k];
00396 u.
twobytes[0]=uc[0];
00397 u.
twobytes[1]=uc[1];
00398
real decoded_j = u.
us*
delta[
k]+fixedpoint_min[
k];
00399
#ifdef SANITYCHECK_CompactVMatrix
00400
real rounded_decoded_i = rint(decoded_i);
00401
if (fabs(rounded_decoded_i-decoded_i)<1e-4)
00402 decoded_i = rounded_decoded_i;
00403
real rounded_decoded_j = rint(decoded_j);
00404
if (fabs(rounded_decoded_j-decoded_j)<1e-4)
00405 decoded_j = rounded_decoded_j;
00406
#endif
00407
dot_product += decoded_i * decoded_j;
00408 }
00409
00410
return dot_product;
00411 }
00412
00413
00414 real CompactVMatrix::dot(
int i,
const Vec& v)
const
00415
{
00416
#ifdef BOUNDCHECK
00417
if (i<0 || i>=length_)
00418
PLERROR(
"CompactVMatrix::dot, row %d out of bounds [0,%d]",i,length_-1);
00419
#endif
00420
00421
if(v.
length()!=
width()-
n_last)
00422
PLERROR(
"In CompactVMatrix::dot, in current implementation v.length() must be equal to width()-n_last");
00423
00424
real dot_product = 0.;
00425
00426
unsigned char* encoded_row = &
data.
data[i*
row_n_bytes];
00427
real* vp=v.
data();
00428
int c=0;
00429
for (
int b=0;b<
symbols_offset;b++)
00430 {
00431
unsigned char byte=encoded_row[b];
00432
for (
int j=0;j<8 && c<
n_bits;j++,c++)
00433 {
00434
int bit = byte & 1;
00435 byte >>= 1;
00436
if(bit)
00437
dot_product += vp[c];
00438 }
00439 }
00440
for (
int b=0;b<
n_symbols;b++)
00441 {
00442
int byte = encoded_row[symbols_offset+b];
00443
if (
one_hot_encoding)
00444 {
00445
int n=
n_symbol_values[b];
00446
dot_product += vp[c+byte];
00447 c += n;
00448 }
00449
else
00450
dot_product += vp[c++]*byte;
00451 }
00452
00453
00454
unsigned char* fixed_point_numbers = &encoded_row[
fixedpoint_offset];
00455
for (
int j=0;j<
n_fixedpoint-
n_last && c<v.
length();j++,c++)
00456 {
00457
unsigned char *uc = &fixed_point_numbers[2*j];
00458
short_and_twobytes u;
00459 u.
twobytes[0]=uc[0];
00460 u.
twobytes[1]=uc[1];
00461
real decoded = u.
us*
delta[j]+
fixedpoint_min[j];
00462
00463
real rounded_decoded = rint(decoded);
00464
if (fabs(rounded_decoded-decoded)<1e-4)
00465 decoded = rounded_decoded;
00466
dot_product += vp[c] * decoded;
00467 }
00468
00469
00470
#ifdef SANITYCHECK_CompactVMatrix
00471
Vec v_i(v.
length());
00472 getRow(i,v_i);
00473
real dot_product2 =
PLearn::dot(v_i.
subVec(0,v.
length()),v);
00474
real diff = fabs(
dot_product-dot_product2)/fabs(dot_product2);
00475
if(diff>
SANITYCHECK_CompactVMatrix_PRECISION)
00476
PLERROR(
"IN CompactVMatrix::dot(int i=%d, v) SANITY CHECK FAILED: difference=%g",i,diff);
00477
#endif
00478
00479
return dot_product;
00480 }
00481
00482
00483 real CompactVMatrix::dotProduct(
int i,
int j)
const
00484
{
return dot(i,j,
width()-
n_last); }
00485
00486 real CompactVMatrix::squareDifference(
int i,
int j)
00487 {
00488
if (
row_norms.
length()==0)
00489
row_norms =
Vec(length_,-1.0);
00490
real normi =
row_norms[i];
00491
if (normi<0) normi=row_norms[i]=
dotProduct(i,i);
00492
real normj = row_norms[j];
00493
if (normj<0) normj=row_norms[j]=dotProduct(j,j);
00494
return normi + normj - 2 * dotProduct(i,j);
00495 }
00496
00497 void CompactVMatrix::encodeAndPutRow(
int i,
Vec v)
00498 {
00499
unsigned char* encoded_row = &
data.
data[i*
row_n_bytes];
00500
real* vp=v.
data();
00501
int* perm=
variables_permutation.
data();
00502
int c=0;
00503
00504
for (
int b=0;b<
symbols_offset;b++)
00505 {
00506
unsigned char byte=0;
00507
for (
int j=0;j<8 && c<
n_bits;j++,c++)
00508 byte |=
int(vp[perm[c]]) << j;
00509 encoded_row[b]=byte;
00510 }
00511
00512
for (
int b=0;b<
n_symbols;b++,c++)
00513 {
00514
real val = vp[perm[c]];
00515
int s =
int(
val);
00516
if (s!=
val)
00517
PLERROR(
"CompactVMatrix::encodeAndPutRow(%d,v): v[%d]=%g not an integer",
00518 i,
int(perm[c]),
val);
00519 encoded_row[symbols_offset+b] = s;
00520
if (s<0 || s>=
n_symbol_values[b])
00521
PLERROR(
"CompactVMatrix::encodeAndPutRow(%d,v): v[%d]=%d not in expected range (0,%d)",
00522 i,
int(perm[c]),s,n_symbol_values[b]-1);
00523 }
00524
00525
00526
unsigned short* fixed_point_numbers = (
unsigned short*)&encoded_row[
fixedpoint_offset];
00527
for (
int j=0;j<
n_fixedpoint;j++,c++)
00528 fixed_point_numbers[j]=(
unsigned short)((vp[perm[c]]-
fixedpoint_min[j])/
delta[j]);
00529 }
00530
00531 void CompactVMatrix::putRow(
int i,
Vec v)
00532 {
00533
putSubRow(i,0,v);
00534 }
00535
00536 void CompactVMatrix::putSubRow(
int i,
int j,
Vec v)
00537 {
00538
unsigned char* encoded_row = &
data.
data[i*
row_n_bytes];
00539
real* vp=v.
data();
00540
int c=0;
00541
00542
for (
int b=0;b<
symbols_offset;b++)
00543 {
00544
unsigned char byte=0;
00545
for (
int k=0;
k<8 && c<
n_bits;
k++,c++)
00546
if (c>=j)
00547 byte |=
int(vp[c-j]) <<
k;
00548 encoded_row[b]=byte;
00549 }
00550
00551
00552
00553
00554
int n=0;
00555
if (
one_hot_encoding)
00556
for (
int b=0;b<
n_symbols;b++,c+=n)
00557 {
00558 n=
n_symbol_values[b];
00559
if (c>=j)
00560 {
00561
int pos=-1;
00562
for (
int k=0;
k<n;
k++)
00563 {
00564
real vk=vp[c+
k-j];
00565
if (vk!=0 && vk!=1)
00566
PLERROR(
"CompactVMatrix::putRow(%d,v): v[%d]=%g!=0 or 1 (not one-hot-code)",
00567 i,c,vk);
00568
if (vk==1)
00569 {
00570
if (pos<0) pos=
k;
00571
else PLERROR(
"CompactVMatrix::putRow(%d,v): %d-th symbol not one-hot-encoded",
00572 i,b);
00573 }
00574 }
00575
if (pos<0)
00576
PLERROR(
"CompactVMatrix::putRow(%d,v): %d-th symbol not one-hot-encoded",
00577 i,b);
00578 encoded_row[symbols_offset+b] = pos;
00579 }
00580 }
00581
else
00582
for (
int b=0;b<n_symbols;b++,c++)
00583
if (c>=j)
00584 {
00585
real val = vp[c-j];
00586
int s =
int(
val);
00587
if (s!=
val)
00588
PLERROR(
"CompactVMatrix::encodeAndPutRow(%d,v): v[%d]=%g not an integer",
00589 i,c,
val);
00590 encoded_row[symbols_offset+b] = s;
00591
if (s<0 || s>=
n_symbol_values[b])
00592
PLERROR(
"CompactVMatrix::encodeAndPutRow(%d,v): v[%d]=%d not in expected range (0,%d)",
00593 i,c,s,n_symbol_values[b]-1);
00594 }
00595
00596
00597
00598
00599
00600
unsigned short* fixed_point_numbers = (
unsigned short*)&encoded_row[
fixedpoint_offset];
00601
for (
int k=0;
k<
n_fixedpoint;
k++,c++)
00602
if (c>=j)
00603 fixed_point_numbers[
k]=(
unsigned short)((vp[c-j]-
fixedpoint_min[
k])/
delta[
k]);
00604 }
00605
00606 void CompactVMatrix::perturb(
int i,
Vec v,
real noise_level,
int n_last)
00607 {
00608
#ifdef BOUNDCHECK
00609
if (i<0 || i>=length_)
00610
PLERROR(
"CompactVMatrix::perturb, row %d out of bounds [0,%d]",i,length_-1);
00611
if (v.
length()!=width_)
00612
PLERROR(
"CompactVMatrix::perturb, length of v (%d) should be equal to width of VMat (%d)",v.
length(),
width());
00613
#endif
00614
if (fieldstats.
size()!=
n_variables)
00615
PLERROR(
"CompactVMatrix::perturb: stats not computed or wrong size");
00616
if (noise_level<0 || noise_level>1)
00617
PLERROR(
"CompactVMatrix::perturb: noise_level=%g, should be in [0,1]",noise_level);
00618
00619
unsigned char* encoded_row = &
data.
data[i*
row_n_bytes];
00620
real* vp=v.
data();
00621
int c=0;
00622
int var=0;
00623
Vec probs(width_);
00624
for (
int b=0;b<
symbols_offset;b++)
00625 {
00626
unsigned char byte=encoded_row[b];
00627
for (
int j=0;j<8 && c<
n_bits;j++,c++,
var++)
00628 {
00629
int bit = byte & 1;
00630 byte >>= 1;
00631 vp[c]=
binomial_sample((1-noise_level)*bit+noise_level*fieldstats[
var].prob(1));
00632 }
00633 }
00634
for (
int b=0;b<
n_symbols;b++,
var++)
00635 {
00636
int byte = encoded_row[symbols_offset+b];
00637
int nv=
n_symbol_values[b];
00638 probs.
resize(nv);
00639
VMFieldStat& stat=fieldstats[
var];
00640
for (
int val=0;
val<nv;
val++)
00641
if (
val==byte)
00642 probs[
val]=(1-noise_level)+noise_level*stat.
prob(
val);
00643
else
00644 probs[
val]=noise_level*stat.
prob(
val);
00645 byte =
multinomial_sample(probs);
00646
if (
one_hot_encoding)
00647 {
00648
int n=n_symbol_values[b];
00649
for (
int j=0;j<n;j++) vp[c+j]=0;
00650 vp[c+byte]=1;
00651 c+=n;
00652 }
00653
else vp[c++]=byte;
00654 }
00655
unsigned char* fixed_point_numbers = &encoded_row[
fixedpoint_offset];
00656
for (
int j=0;j<
n_fixedpoint;j++,c++,
var++)
00657 {
00658
unsigned char *uc = &fixed_point_numbers[2*j];
00659
short_and_twobytes u;
00660 u.
twobytes[0]=uc[0];
00661 u.
twobytes[1]=uc[1];
00662
real decoded = u.
us*
delta[j]+
fixedpoint_min[j];
00663
00664
real rounded_decoded = rint(decoded);
00665
if (fabs(rounded_decoded-decoded)<1e-4)
00666 decoded = rounded_decoded;
00667
if (
var<
n_variables-n_last)
00668 {
00669
int ntry=0;
00670
do
00671 {
00672 vp[c]=decoded+noise_level*fieldstats[
var].stddev()*
normal_sample();
00673 ntry++;
00674
if (ntry>=100)
00675
PLERROR(
"CompactVMatrix::perturb:Something wrong in resampling, tried 100 times");
00676 }
00677
while (vp[c]<fixedpoint_min[j] || vp[c]>
fixedpoint_max[j]);
00678 }
00679
else
00680 vp[c]=decoded;
00681 }
00682 }
00683
00684
00685
00686
00687
00688
00689
00690
00691
00692
00693
00694
00695
00696
00697
00698
00699
00700
00701
00702
00703
00704
00705
00706
00707
00708
00709
00710
00711
00712
00713
00714
00715
00716
00717
00718
00719
00720
00721
00722
00723
00724
00725
00726
00727
00728
00729
00730
00731
00732
00733
00734 void CompactVMatrix::append(
CompactVMatrix* vm)
00735 {
00736
if (width_!=vm->
width())
00737
PLERROR(
"CompactVMatrix::append, incompatible width %d vs %d",
00738 width_,vm->
width());
00739
if (
row_n_bytes!=vm->
row_n_bytes)
00740
PLERROR(
"CompactVMatrix::append, incompatible row_n_bytes %d vs %d",
00741
row_n_bytes,vm->
row_n_bytes);
00742
if (
n_bits!=vm->
n_bits)
00743
PLERROR(
"CompactVMatrix::append, incompatible n_bits %d vs %d",
00744
n_bits,vm->
n_bits);
00745
if (
n_symbols!=vm->
n_symbols)
00746
PLERROR(
"CompactVMatrix::append, incompatible n_symbols %d vs %d",
00747
n_symbols,vm->
n_symbols);
00748
if (
n_fixedpoint!=vm->
n_fixedpoint)
00749
PLERROR(
"CompactVMatrix::append, incompatible n_fixedpoint %d vs %d",
00750
n_fixedpoint,vm->
n_fixedpoint);
00751
if (
n_symbol_values!=vm->
n_symbol_values)
00752 {
00753
00754
00755
PLearn::write(cerr,
n_symbol_values);
00756 cerr <<
endl;
00757
PLearn::write(cerr, vm->
n_symbol_values);
00758 cerr <<
endl;
00759
PLERROR(
"CompactVMatrix::append, incompatible n_symbol_values");
00760 }
00761
bool rescale =
false;
00762
for (
int j=0;j<
n_fixedpoint && !rescale;j++)
00763
if (
fixedpoint_min[j]>vm->
fixedpoint_min[j] ||
00764
fixedpoint_max[j]<vm->
fixedpoint_max[j]) rescale=
true;
00765
if (rescale)
00766 {
00767 cout <<
"The appended VMat has intervals that are wider than the current one." <<
endl;
00768 cout <<
"Start rescaling numeric variables fixed point representation." <<
endl;
00769
Vec new_min =
fixedpoint_min.
copy();
00770
Vec new_max =
fixedpoint_max.
copy();
00771
Vec new_delta =
delta.
copy();
00772
TVec<bool> change(
n_fixedpoint);
00773
for (
int j=0;j<
n_fixedpoint;j++)
00774 {
00775 change[j]=
false;
00776
if (
fixedpoint_min[j]>vm->
fixedpoint_min[j])
00777 {
00778 change[j]=
true;
00779 new_min[j]=vm->
fixedpoint_min[j];
00780 }
00781
if (
fixedpoint_max[j]<vm->
fixedpoint_max[j])
00782 {
00783 change[j]=
true;
00784 new_max[j]=vm->
fixedpoint_max[j];
00785 }
00786
if (change[j])
00787 new_delta[j]=(new_max[j]-new_min[j])/USHRT_MAX;
00788 }
00789
for (
int r=0;r<length_;r++)
00790 {
00791
unsigned char* encoded_row = &
data.
data[r*
row_n_bytes];
00792
unsigned char* fixed_point_numbers = &encoded_row[
fixedpoint_offset];
00793
for (
int j=0;j<n_fixedpoint;j++)
00794
if (change[j])
00795 {
00796
00797
unsigned char *uc = &fixed_point_numbers[2*j];
00798
short_and_twobytes u;
00799 u.
twobytes[0]=uc[0];
00800 u.
twobytes[1]=uc[1];
00801
real decoded = u.
us*
delta[j]+
fixedpoint_min[j];
00802
00803
real rounded_decoded = rint(decoded);
00804
if (fabs(rounded_decoded-decoded)<1e-4)
00805 decoded = rounded_decoded;
00806
00807 fixed_point_numbers[j]=(
unsigned char)((decoded-new_min[j])/new_delta[j]);
00808 }
00809 }
00810 cout <<
"DONE rescaling numeric variables fixed point representation." <<
endl;
00811
fixedpoint_min << new_min;
00812
fixedpoint_max << new_max;
00813
delta << new_delta;
00814 }
00815
int new_length=length_+vm->
length();
00816
data.
resize(
row_n_bytes*new_length);
00817
00818
Vec row(width_);
00819
bool old_vm_encoding = vm->
one_hot_encoding;
00820
bool old_encoding =
one_hot_encoding;
00821 vm->
one_hot_encoding=
false;
00822
setOneHotMode(
false);
00823
int old_length=length_;
00824 length_=new_length;
00825
for (
int r=0;r<vm->
length();r++)
00826 {
00827 vm->
getRow(r,row);
00828
putRow(old_length+r,row);
00829 }
00830 vm->
one_hot_encoding=old_vm_encoding;
00831
setOneHotMode(old_encoding);
00832 }
00833
00834 void CompactVMatrix::makeDeepCopyFromShallowCopy(map<const void*, void*>& copies)
00835 {
00836
deepCopyField(
data, copies);
00837
deepCopyField(
n_symbol_values, copies);
00838
deepCopyField(
fixedpoint_min, copies);
00839
deepCopyField(
fixedpoint_max, copies);
00840
deepCopyField(
variables_permutation, copies);
00841 }
00842
00843 }
00844