PLearn: SimpleDB.cc Source File

// -*- C++ -*-

// SimpleDB.cc: Simple Database Representation (implementation)
//
// Copyright (C) 2000 Nicolas Chapados
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//  1. Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//
//  2. Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//
//  3. The name of the authors may not be used to endorse or promote
//     products derived from this software without specific prior written
//     permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
// NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// This file is part of the PLearn library.
For more information on the PLearn 00033 // library, go to the PLearn Web site at www.plearn.org 00034 00035 #define _GNU_SOURCE 1 00036 00037 #include "SimpleDB.h" 00038 #include <plearn/math/random.h> 00039 #include <plearn/base/stringutils.h> 00040 00041 namespace PLearn { 00042 using namespace std; 00043 00044 00045 //##### Static Members #################################################### 00046 00047 const char MissingString = '\0'; 00048 const unsigned char MissingCharacter = (unsigned char)SCHAR_MIN; 00049 const signed char MissingSignedChar = (signed char)SCHAR_MIN; 00050 const short MissingShort = SHRT_MIN; 00051 const int MissingInt = INT_MIN; 00052 const float MissingFloat = MISSING_VALUE; 00053 const double MissingDouble = MISSING_VALUE; 00054 const PDate MissingDate; // default ctor ==> missing date 00055 00056 00057 //##### Schema-related functions ########################################## 00058 00059 bool Schema::findColumn(const string& name, int& position, int& start, 00060 int& precision) const 00061 { 00062 const_iterator it = begin(), end = this->end(); 00063 position = start = precision = 0; 00064 00065 for (; it != end; start += it->precision, ++it, ++position) 00066 if (it->name == name) { 00067 precision = it->precision; 00068 break; 00069 } 00070 return (it == end)? 
false : true; 00071 } 00072 00073 00074 FieldPtr Schema::findColumn(int position) const 00075 { 00076 int orig_position = position; 00077 ptrdiff_t offset_= 0; 00078 const_iterator it=begin(), end=this->end(); 00079 for (; position && it != end; --position, ++it) 00080 offset_+= it->precision; 00081 if (it == end) 00082 PLERROR("Column %d does not exist in schema", 00083 orig_position); 00084 return FieldPtr(orig_position, offset_); 00085 } 00086 00087 00088 FieldPtr Schema::findColumn(const string& name) const 00089 { 00090 int position, start, precision; 00091 bool found = findColumn(name, position, start, precision); 00092 if (!found) 00093 PLERROR("Column %s does not exist in schema", 00094 name.c_str()); 00095 return FieldPtr(position, start); 00096 } 00097 00098 00099 //##### FieldValue ######################################################## 00100 00101 FieldValue::FieldValue() 00102 : field_type_(Unknown), precision_(0) 00103 {} 00104 00105 FieldValue::FieldValue(const FieldValue& fv) 00106 : field_type_(fv.field_type_), precision_(fv.precision_) 00107 { 00108 switch (field_type_) { 00109 case Unknown: 00110 break; 00111 00112 case StringType: 00113 { 00114 int stringlen = (int)strlen(fv.string_val_); 00115 if (stringlen+1 != precision_) 00116 PLERROR("Strings in a database field must contain a terminating null"); 00117 string_val_ = new char[precision_]; 00118 strcpy(string_val_, fv.string_val_); 00119 } 00120 break; 00121 00122 case CharacterType: 00123 case SignedCharType: 00124 case ShortType: 00125 case IntType: 00126 long_val_ = fv.long_val_; 00127 break; 00128 00129 case FloatType: 00130 case DoubleType: 00131 double_val_ = fv.double_val_; 00132 break; 00133 00134 case DateType: 00135 date_val_ = fv.date_val_; 00136 break; 00137 } 00138 } 00139 00140 FieldValue::~FieldValue() 00141 { 00142 switch(field_type_) { 00143 case StringType: 00144 delete[] string_val_; 00145 00146 case Unknown: 00147 case CharacterType: 00148 case SignedCharType: 00149 case 
ShortType: 00150 case IntType: 00151 case FloatType: 00152 case DoubleType: 00153 case DateType: 00154 break; 00155 } 00156 } 00157 00158 FieldValue::FieldValue(const char* str) 00159 : field_type_(StringType), precision_(strlen(str)+1) 00160 { 00161 string_val_ = new char[precision_]; 00162 strcpy(string_val_, str); 00163 } 00164 00165 FieldValue::FieldValue(unsigned char x) 00166 : field_type_(CharacterType), 00167 precision_(Field("",CharacterType).precision), 00168 long_val_(x) 00169 {} 00170 00171 FieldValue::FieldValue(signed char x) 00172 : field_type_(SignedCharType), 00173 precision_(Field("",SignedCharType).precision), 00174 long_val_(x) 00175 {} 00176 00177 FieldValue::FieldValue(short x) 00178 : field_type_(ShortType), 00179 precision_(Field("",ShortType).precision), 00180 long_val_(x) 00181 {} 00182 00183 FieldValue::FieldValue(int x) 00184 : field_type_(IntType), 00185 precision_(Field("",IntType).precision), 00186 long_val_(x) 00187 {} 00188 00189 FieldValue::FieldValue(float x) 00190 : field_type_(FloatType), 00191 precision_(Field("",FloatType).precision), 00192 double_val_(x) 00193 {} 00194 00195 FieldValue::FieldValue(double x) 00196 : field_type_(DoubleType), 00197 precision_(Field("",DoubleType).precision), 00198 double_val_(x) 00199 {} 00200 00201 FieldValue::FieldValue(const PDate& x) 00202 : field_type_(DateType), 00203 precision_(Field("",DateType).precision) 00204 { 00205 date_val_.year = x.year; 00206 date_val_.month = x.month; 00207 date_val_.day = x.day; 00208 } 00209 00210 bool FieldValue::isMissing() const 00211 { 00212 switch (field_type_) { 00213 case Unknown: return true; 00214 case StringType: return !string_val_ || string_val_[0] == MissingString; 00215 case CharacterType: return (unsigned char)long_val_ == MissingCharacter; 00216 case SignedCharType:return (signed char) long_val_ == MissingSignedChar; 00217 case ShortType: return (short) long_val_ == MissingShort; 00218 case IntType: return (int) long_val_ == MissingInt; 00219 
case FloatType: 00220 case DoubleType: return isnan(double_val_); 00221 case DateType: 00222 return date_val_.year == MissingDate.year && 00223 date_val_.month == MissingDate.month && 00224 date_val_.day == MissingDate.day; 00225 } 00226 return false; 00227 } 00228 00229 void FieldValue::setMissing() 00230 { 00231 switch (field_type_) { 00232 case Unknown: break; 00233 case StringType: if (string_val_) string_val_[0] = MissingString; break; 00234 case CharacterType: long_val_ = long(MissingCharacter); break; 00235 case SignedCharType:long_val_ = long(MissingSignedChar); break; 00236 case ShortType: long_val_ = long(MissingShort); break; 00237 case IntType: long_val_ = long(MissingInt); break; 00238 case FloatType: 00239 case DoubleType: double_val_ = MissingDouble; break; 00240 case DateType: 00241 date_val_.year = MissingDate.year; 00242 date_val_.month = MissingDate.month; 00243 date_val_.day = MissingDate.day; 00244 break; 00245 } 00246 } 00247 00248 string FieldValue::toString() const 00249 { 00250 if (isMissing()) 00251 return ""; 00252 switch (field_type_) { 00253 case Unknown: return ""; 00254 case StringType: return space_to_underscore(string_val_); 00255 case CharacterType: return string(1,char(long_val_)); 00256 case SignedCharType: 00257 case ShortType: 00258 case IntType: return tostring(long_val_); 00259 case FloatType: 00260 case DoubleType: return tostring(double_val_); 00261 case DateType: 00262 return PDate(date_val_.year, date_val_.month, date_val_.day).info(); 00263 } 00264 return ""; 00265 } 00266 00267 double FieldValue::toDouble() const 00268 { 00269 if (isMissing()) 00270 return MISSING_VALUE; 00271 switch (field_type_) { 00272 case Unknown: 00273 return MISSING_VALUE; 00274 00275 case StringType: 00276 case CharacterType: 00277 PLERROR("Cannot convert string or character field to double"); 00278 break; 00279 00280 case SignedCharType: 00281 case ShortType: 00282 case IntType: 00283 return double(long_val_); 00284 00285 case FloatType: 00286 
case DoubleType: 00287 return double_val_; 00288 00289 case DateType: 00290 return double(date_to_float(PDate(date_val_.year, date_val_.month, 00291 date_val_.day))); 00292 } 00293 return MISSING_VALUE; 00294 } 00295 00296 PDate FieldValue::toDate() const 00297 { 00298 switch(field_type_) { 00299 case DateType: 00300 return PDate(date_val_.year, date_val_.month, date_val_.day); 00301 00302 default: 00303 PLERROR("Cannot convert non-date field type to a date"); 00304 } 00305 return PDate(); 00306 } 00307 00308 FieldValue& FieldValue::operator=(FieldValue rhs) 00309 { 00310 swap(rhs); 00311 return *this; 00312 } 00313 00314 bool FieldValue::operator==(const FieldValue& rhs) const 00315 { 00316 const FieldValue& lhs = *this; 00317 const FieldType& lhs_type = field_type_; 00318 const FieldType& rhs_type = rhs.field_type_; 00319 00320 // Strings 00321 if (lhs_type == StringType && rhs_type == StringType) 00322 return !strcmp(lhs.string_val_, rhs.string_val_); 00323 else if (lhs_type == StringType || rhs_type == StringType) 00324 PLERROR("A string can be compared for equality only with another string"); 00325 00326 // Dates 00327 else if (lhs_type == DateType && rhs_type == DateType) 00328 return 00329 PDate(lhs.date_val_.year, lhs.date_val_.month, lhs.date_val_.day) == 00330 PDate(rhs.date_val_.year, rhs.date_val_.month, rhs.date_val_.day); 00331 else if (lhs_type == DateType || rhs_type == DateType) 00332 PLERROR("A date can be compared for equality only with another date"); 00333 00334 // Two integrals 00335 else if (lhs.isIntegral() && rhs.isIntegral()) 00336 return lhs.long_val_ == rhs.long_val_; 00337 00338 // Two floating 00339 else if (lhs.isFloating() && rhs.isFloating()) 00340 return lhs.double_val_ == rhs.double_val_; 00341 00342 // Cross-numeric 00343 else if (lhs.isIntegral() && rhs.isFloating()) 00344 return lhs.long_val_ == rhs.double_val_; 00345 else if (lhs.isFloating() && lhs.isIntegral()) 00346 return lhs.double_val_ == rhs.long_val_; 00347 00348 // 
Otherwise, PLERROR(should not happen) 00349 else 00350 PLERROR("Unrecognized case in equality testing between FieldValues"); 00351 00352 return false; // shut up the compiler 00353 } 00354 00355 bool FieldValue::operator<(const FieldValue& rhs) const 00356 { 00357 const FieldValue& lhs = *this; 00358 const FieldType& lhs_type = field_type_; 00359 const FieldType& rhs_type = rhs.field_type_; 00360 00361 // Strings 00362 if (lhs_type == StringType && rhs_type == StringType) 00363 return strcmp(lhs.string_val_, rhs.string_val_) < 0; 00364 else if (lhs_type == StringType || rhs_type == StringType) 00365 PLERROR("A string can be relationally compared only with another string"); 00366 00367 // Dates 00368 else if (lhs_type == DateType && rhs_type == DateType) 00369 return 00370 PDate(lhs.date_val_.year, lhs.date_val_.month, lhs.date_val_.day) < 00371 PDate(rhs.date_val_.year, rhs.date_val_.month, rhs.date_val_.day); 00372 else if (lhs_type == DateType || rhs_type == DateType) 00373 PLERROR("A date can be relationally compared only with another date"); 00374 00375 // Two integrals 00376 else if (lhs.isIntegral() && rhs.isIntegral()) 00377 return lhs.long_val_ < rhs.long_val_; 00378 00379 // Two floating 00380 else if (lhs.isFloating() && rhs.isFloating()) 00381 return lhs.double_val_ < rhs.double_val_; 00382 00383 // Cross-numeric 00384 else if (lhs.isIntegral() && rhs.isFloating()) 00385 return lhs.long_val_ < rhs.double_val_; 00386 else if (lhs.isFloating() && lhs.isIntegral()) 00387 return lhs.double_val_ < rhs.long_val_; 00388 00389 // Otherwise, PLERROR(should not happen) 00390 else 00391 PLERROR("Unrecognized case in relational testing between FieldValues"); 00392 00393 return false; // shut up the compiler 00394 } 00395 00396 FieldValue FieldValue::operator+(const FieldValue& rhs) const 00397 { 00398 const FieldValue& lhs = *this; 00399 const FieldType& lhs_type = field_type_; 00400 const FieldType& rhs_type = rhs.field_type_; 00401 00402 // Arithmetic addition is 
not defined for strings or dates or characters 00403 if (lhs_type == StringType || rhs_type == StringType) 00404 PLERROR("Strings cannot be added"); 00405 else if (lhs_type == CharacterType || rhs_type == CharacterType) 00406 PLERROR("Characters cannot be added"); 00407 else if (lhs_type == DateType || rhs_type == DateType) 00408 PLERROR("Dates cannot be added"); 00409 00410 // Twice the same type 00411 else if (lhs.isIntegral() && rhs.isIntegral()) 00412 return FieldValue(int(lhs.long_val_ + rhs.long_val_)); 00413 else if (lhs.isFloating() && rhs.isFloating()) 00414 return FieldValue(double(lhs.double_val_ + rhs.double_val_)); 00415 00416 // Cross-numeric : convert to double 00417 else if (lhs.isIntegral() && rhs.isFloating()) 00418 return FieldValue(double(lhs.long_val_ + rhs.double_val_)); 00419 else if (lhs.isFloating() && rhs.isIntegral()) 00420 return FieldValue(double(lhs.double_val_ + rhs.long_val_)); 00421 00422 // Otherwise, PLERROR(should not happen) 00423 else 00424 PLERROR("Unrecognized case in addition between FieldValues"); 00425 00426 return FieldValue(); 00427 } 00428 00429 FieldValue FieldValue::operator-(const FieldValue& rhs) const 00430 { 00431 const FieldValue& lhs = *this; 00432 const FieldType& lhs_type = field_type_; 00433 const FieldType& rhs_type = rhs.field_type_; 00434 00435 // Arithmetic subtraction is not defined for strings or characters 00436 if (lhs_type == StringType || rhs_type == StringType) 00437 PLERROR("Strings cannot be subtracted"); 00438 else if (lhs_type == CharacterType || rhs_type == CharacterType) 00439 PLERROR("Characters cannot be subtracted"); 00440 00441 // For dates, return the number of days between two dates 00442 else if (lhs_type == DateType && rhs_type == DateType) 00443 return FieldValue(int( 00444 PDate(lhs.date_val_.year, lhs.date_val_.month, lhs.date_val_.day) - 00445 PDate(rhs.date_val_.year, rhs.date_val_.month, rhs.date_val_.day))); 00446 else if (lhs_type == DateType || rhs_type == DateType) 00447 
PLERROR("A date and a non-date cannot be subtracted"); 00448 00449 // Twice the same type 00450 else if (lhs.isIntegral() && rhs.isIntegral()) 00451 return FieldValue(int(lhs.long_val_ - rhs.long_val_)); 00452 else if (lhs.isFloating() && rhs.isFloating()) 00453 return FieldValue(double(lhs.double_val_ - rhs.double_val_)); 00454 00455 // Cross-numeric : convert to double 00456 else if (lhs.isIntegral() && rhs.isFloating()) 00457 return FieldValue(double(lhs.long_val_ - rhs.double_val_)); 00458 else if (lhs.isFloating() && rhs.isIntegral()) 00459 return FieldValue(double(lhs.double_val_ - rhs.long_val_)); 00460 00461 // Otherwise, PLERROR(should not happen) 00462 else 00463 PLERROR("Unrecognized case in subtraction between FieldValues"); 00464 00465 return FieldValue(); 00466 } 00467 00468 FieldValue FieldValue::operator*(const FieldValue& rhs) const 00469 { 00470 const FieldValue& lhs = *this; 00471 const FieldType& lhs_type = field_type_; 00472 const FieldType& rhs_type = rhs.field_type_; 00473 00474 // Arithmetic addition is not defined for strings or dates or characters 00475 if (lhs_type == StringType || rhs_type == StringType) 00476 PLERROR("Strings cannot be multiplied"); 00477 else if (lhs_type == CharacterType || rhs_type == CharacterType) 00478 PLERROR("Characters cannot be multiplied"); 00479 else if (lhs_type == DateType || rhs_type == DateType) 00480 PLERROR("Dates cannot be multiplied"); 00481 00482 // Twice the same type 00483 else if (lhs.isIntegral() && rhs.isIntegral()) 00484 return FieldValue(int(lhs.long_val_ * rhs.long_val_)); 00485 else if (lhs.isFloating() && rhs.isFloating()) 00486 return FieldValue(double(lhs.double_val_ * rhs.double_val_)); 00487 00488 // Cross-numeric : convert to double 00489 else if (lhs.isIntegral() && rhs.isFloating()) 00490 return FieldValue(double(lhs.long_val_ * rhs.double_val_)); 00491 else if (lhs.isFloating() && rhs.isIntegral()) 00492 return FieldValue(double(lhs.double_val_ * rhs.long_val_)); 00493 00494 // 
Otherwise, PLERROR(should not happen) 00495 else 00496 PLERROR("Unrecognized case in multiplication between FieldValues"); 00497 00498 return FieldValue(); 00499 } 00500 00501 FieldValue FieldValue::operator/(const FieldValue& rhs) const 00502 { 00503 const FieldValue& lhs = *this; 00504 const FieldType& lhs_type = field_type_; 00505 const FieldType& rhs_type = rhs.field_type_; 00506 00507 // Arithmetic addition is not defined for strings or dates or characters 00508 if (lhs_type == StringType || rhs_type == StringType) 00509 PLERROR("Strings cannot be divided"); 00510 else if (lhs_type == CharacterType || rhs_type == CharacterType) 00511 PLERROR("Characters cannot be divided"); 00512 else if (lhs_type == DateType || rhs_type == DateType) 00513 PLERROR("Dates cannot be divided"); 00514 00515 // Twice the same type 00516 else if (lhs.isIntegral() && rhs.isIntegral()) 00517 return FieldValue(int(lhs.long_val_ / rhs.long_val_)); 00518 else if (lhs.isFloating() && rhs.isFloating()) 00519 return FieldValue(double(lhs.double_val_ / rhs.double_val_)); 00520 00521 // Cross-numeric : convert to double 00522 else if (lhs.isIntegral() && rhs.isFloating()) 00523 return FieldValue(double(lhs.long_val_ / rhs.double_val_)); 00524 else if (lhs.isFloating() && rhs.isIntegral()) 00525 return FieldValue(double(lhs.double_val_ / rhs.long_val_)); 00526 00527 // Otherwise, PLERROR(should not happen) 00528 else 00529 PLERROR("Unrecognized case in division between FieldValues"); 00530 00531 return FieldValue(); 00532 } 00533 00534 void FieldValue::swap(FieldValue& rhs) 00535 { 00536 std::swap(field_type_, rhs.field_type_); 00537 std::swap(precision_, rhs.precision_); 00538 switch(field_type_) { 00539 case Unknown: break; 00540 case StringType: std::swap(string_val_, rhs.string_val_); break; 00541 case CharacterType: 00542 case SignedCharType: 00543 case ShortType: 00544 case IntType: std::swap(long_val_, rhs.long_val_); break; 00545 case FloatType: 00546 case DoubleType: 
std::swap(double_val_, rhs.double_val_); break; 00547 case DateType: std::swap(date_val_, rhs.date_val_); break; 00548 } 00549 } 00550 00551 ostream& operator<<(ostream& os, const FieldValue& ft) 00552 { 00553 // quite frankly too simple for now 00554 return os << ft.toString(); 00555 } 00556 00557 00558 //##### Row-Iterator-Related functions #################################### 00559 00560 bool RowIterator::isMissing() const 00561 { 00562 if (const char* x = asString()) 00563 return x[0] == MissingString; 00564 else if (const unsigned char* x = asCharacter()) 00565 return x[0] == MissingCharacter; 00566 else if (const signed char* x = asSignedChar()) 00567 return x[0] == MissingSignedChar; 00568 else if (const short* x = asShort()) 00569 return *x == MissingShort; 00570 else if (const int* x = asInt()) 00571 return *x == MissingInt; 00572 else if (const float* x = asFloat()) 00573 return isnan(*x); 00574 else if (const double* x = asDouble()) 00575 return isnan(*x); 00576 else if (const PDate* x = asDate()) 00577 return *x == MissingDate; 00578 else 00579 return false; 00580 } 00581 00582 void RowIterator::setMissing() 00583 { 00584 if (char* x = asString()) 00585 *x = MissingString; 00586 else if (unsigned char* x = asCharacter()) 00587 *x = MissingCharacter; 00588 else if (signed char* x = asSignedChar()) 00589 *x = MissingSignedChar; 00590 else if (short* x = asShort()) 00591 *x = MissingShort; 00592 else if (int* x = asInt()) 00593 *x = MissingInt; 00594 else if (float* x = asFloat()) 00595 *x = MissingFloat; 00596 else if (double* x = asDouble()) 00597 *x = MissingDouble; 00598 else if (PDate* x = asDate()) 00599 *x = MissingDate; 00600 } 00601 00602 int RowIterator::char_width() const 00603 { 00604 int w = 0; 00605 if (isString()) 00606 w = precision()-1; // minus terminating null 00607 else if (isCharacter()) 00608 w = 1; // 'A' 00609 else if (isSignedChar()) 00610 w = 4; // -127 00611 else if (isShort()) 00612 w = 6; // -32767 00613 else if (isInt()) 00614 
w = 11; // -2 billion 00615 else if (isFloat()) 00616 w = 8; // -precision + decimal point 00617 else if (isDouble()) 00618 w = 8; // -precision + decimal point 00619 else if (isDate()) 00620 w = 10; // YYYY/MM/DD 00621 else 00622 PLERROR("Unknown type for iterator, field %d (%s)",curfield,name().c_str()); 00623 00624 return std::max(int(w),int(name().size())); 00625 } 00626 00627 double RowIterator::toDouble() const 00628 { 00629 if (isMissing()) 00630 return MISSING_VALUE; 00631 if (asString()) 00632 PLERROR("Cannot convert string to double"); 00633 if (asCharacter()) 00634 PLERROR("Cannot convert character to double"); 00635 if (const signed char* x = asSignedChar()) 00636 return double(*x); 00637 if (const short* x = asShort()) 00638 return double(*x); 00639 if (const int* x = asInt()) 00640 return double(*x); 00641 if (const float* x = asFloat()) 00642 return double(*x); 00643 if (const double* x = asDouble()) 00644 return *x; 00645 if (const PDate* x = asDate()) 00646 return double(date_to_float(*x)); 00647 return MISSING_VALUE; 00648 } 00649 00650 string RowIterator::toString() const 00651 { 00652 if (isMissing()) 00653 return ""; 00654 if (const char* x = asString()) 00655 return space_to_underscore(x); 00656 if (const unsigned char* x = asCharacter()) 00657 return string(1,char(*x)); 00658 if (const signed char* x = asSignedChar()) 00659 return tostring(int(*x)); 00660 if (const short* x = asShort()) 00661 return tostring(*x); 00662 if (const int* x = asInt()) 00663 return tostring(*x); 00664 if (const float* x = asFloat()) 00665 return tostring(*x); 00666 if (const double* x = asDouble()) 00667 return tostring(*x); 00668 if (const PDate* x = asDate()) 00669 return x->info(); 00670 return ""; 00671 } 00672 00673 double todouble(const RowIterator& it) 00674 { 00675 return it.toDouble(); 00676 } 00677 00678 string tostring(const RowIterator& it) 00679 { 00680 return it.toString(); 00681 } 00682 00683 00684 //##### FieldRowRef 
######################################################### 00685 00686 FieldRowRef::operator FieldValue() const 00687 { 00688 if (const char* x = it_.asString()) 00689 return FieldValue(x); 00690 if (const unsigned char* x = it_.asCharacter()) 00691 return FieldValue(*x); 00692 if (const signed char* x = it_.asSignedChar()) 00693 return FieldValue(*x); 00694 if (const short* x = it_.asShort()) 00695 return FieldValue(*x); 00696 if (const int* x = it_.asInt()) 00697 return FieldValue(*x); 00698 if (const float* x = it_.asFloat()) 00699 return FieldValue(*x); 00700 if (const double* x = it_.asDouble()) 00701 return FieldValue(*x); 00702 if (const PDate* x = it_.asDate()) 00703 return FieldValue(*x); 00704 return FieldValue(); 00705 } 00706 00707 // This assignment operator is complicated by the fact that the LHS type 00708 // may not have anything to do with the RHS type. Appropriate 00709 // conversions must be enacted. 00710 FieldRowRef& FieldRowRef::operator=(const FieldValue& rhs) 00711 { 00712 // Strings ==> convert anything into string form 00713 if (char* x = it_.asString()) { 00714 strncpy(x, rhs.toString().c_str(), it_.precision()); 00715 x[it_.precision()-1] = '\0'; 00716 } 00717 else if (unsigned char* x = it_.asCharacter()) { 00718 if (rhs.isIntegral()) 00719 *x = (unsigned char)rhs.long_val_; 00720 else if (rhs.isFloating()) 00721 *x = (unsigned char)rhs.double_val_; 00722 else 00723 PLERROR("Cannot convert a string or a date into an unsigned character"); 00724 } 00725 else if (signed char* x = it_.asSignedChar()) { 00726 if (rhs.isIntegral()) 00727 *x = (signed char)rhs.long_val_; 00728 else if (rhs.isFloating()) 00729 *x = (signed char)rhs.double_val_; 00730 else 00731 PLERROR("Cannot convert a string or a date into a signed character"); 00732 } 00733 else if (short* x = it_.asShort()) { 00734 if (rhs.isIntegral()) 00735 *x = (short)rhs.long_val_; 00736 else if (rhs.isFloating()) 00737 *x = (short)rhs.double_val_; 00738 else 00739 PLERROR("Cannot 
convert a string or a date into a short"); 00740 } 00741 else if (int* x = it_.asInt()) { 00742 if (rhs.isIntegral()) 00743 *x = (int)rhs.long_val_; 00744 else if (rhs.isFloating()) 00745 *x = (int)rhs.double_val_; 00746 else 00747 PLERROR("Cannot convert a string or a date into an int"); 00748 } 00749 else if (float* x = it_.asFloat()) { 00750 if (rhs.isIntegral()) 00751 *x = (float)rhs.long_val_; 00752 else if (rhs.isFloating()) 00753 *x = (float)rhs.double_val_; 00754 else 00755 PLERROR("Cannot convert a string or a date into a float"); 00756 } 00757 else if (double* x = it_.asDouble()) { 00758 if (rhs.isIntegral()) 00759 *x = (double)rhs.long_val_; 00760 else if (rhs.isFloating()) 00761 *x = (double)rhs.double_val_; 00762 else 00763 PLERROR("Cannot convert a string or a date into a double"); 00764 } 00765 else if (PDate* x = it_.asDate()) { 00766 if (rhs.isDate()) 00767 *x = PDate(rhs.date_val_.year, rhs.date_val_.month, 00768 rhs.date_val_.day); 00769 else 00770 PLERROR("Cannot convert a non-date into a date"); 00771 } 00772 else 00773 PLERROR("Unrecognized case in assignment in FieldRowRef from FieldValue"); 00774 00775 return *this; 00776 } 00777 00778 00779 //##### Row-Related functions ############################################# 00780 00781 Row::Row(const Schema* s) : schema(s) 00782 { 00783 // Compute the total size of a row 00784 int n=0; 00785 Schema::const_iterator it = schema->begin(), end = schema->end(); 00786 for ( ; it != end; ++it ) { 00787 n += it->precision; 00788 } 00789 rawrow.resize(n, '\0'); // zero-initialize it 00790 } 00791 00792 void Row::sanitize() const 00793 { 00794 // The sanitization operation canonicalizes all fields in the row. 00795 // Should be called before writing it to disk. This enables indexing 00796 // and matching operations to find rows quickly simply by comparing 00797 // byte vectors. 
At the moment, the only sanity check is to zero-fill 00798 // all character strings beyond the initial null until the precision of 00799 // their field. 00800 00801 Row* This = const_cast<Row*>(this); 00802 iterator it = This->begin(), end = This->end(); 00803 for ( ; it != end; ++it ) { 00804 if (char *x = it.asString()) { 00805 int prec = it.precision(); 00806 bool clearing = false; 00807 for ( ; prec; ++x, --prec) 00808 if (clearing) 00809 *x = '\0'; 00810 else if (*x == '\0') 00811 clearing = true; 00812 } 00813 } 00814 } 00815 00816 Row::iterator Row::operator[](int fieldNumber) 00817 { 00818 iterator it=this->begin(), end=this->end(); 00819 for (; fieldNumber && it != end; --fieldNumber, ++it) 00820 ; 00821 return it; 00822 } 00823 00824 Row::iterator Row::operator[](string fieldName) 00825 { 00826 iterator it=this->begin(), end=this->end(); 00827 Schema::const_iterator scit=schema->begin(), scend=schema->end(); 00828 for(; it != end && scit != scend; ++it, ++scit) 00829 if (scit->name == fieldName) 00830 break; 00831 return it; 00832 } 00833 00834 void printFieldName(ostream& o, const Row::iterator& field) 00835 { 00836 o.setf(ios::right, ios::adjustfield); 00837 o.fill(' '); 00838 o.width(field.char_width()); 00839 o << field.name().c_str(); 00840 } 00841 00842 void printFieldNames(ostream& o, const Row& rowc) 00843 { 00844 Row& row = const_cast<Row&>(rowc); 00845 Row::const_iterator it = row.begin(), end = row.end(); 00846 00847 while(it!=end) 00848 { 00849 printFieldName(o,it); 00850 o << " | "; 00851 ++it; 00852 } 00853 o << endl; 00854 } 00855 00856 ostream& operator<<(ostream& o, const Row::iterator& field) 00857 { 00858 o.setf(ios::right, ios::adjustfield); 00859 o.fill(' '); 00860 o.width(field.char_width()); 00861 00862 // cout << "[" << field.char_width() << "]" << endl; 00863 00864 if (field.isMissing()) 00865 o << " "; 00866 else if (const char* x = field.asString()) 00867 o << x; 00868 else if (const unsigned char* x = field.asCharacter()) 00869 
{ 00870 if (isprint(*x)) 00871 { 00872 // couldn't get the formatting using ostream.width() to work so I'm 00873 // using this... 00874 o.width(0); 00875 o << center(string(1,*x),field.char_width()); 00876 } 00877 else 00878 { 00879 o.setf(ios::left, ios::adjustfield); 00880 o.width(0); 00881 o << "0x"; 00882 o.width(field.char_width()-2); 00883 o << hex << int(*x) << dec; 00884 o.setf(ios::right, ios::adjustfield); 00885 } 00886 } 00887 else if (const signed char* x = field.asSignedChar()) 00888 o << int(*x); 00889 else if (const short* x = field.asShort()) 00890 o << *x; 00891 else if (const int* x = field.asInt()) 00892 o << *x; 00893 else if (const float* x = field.asFloat()) 00894 { 00895 o.setf(ios::fmtflags(0), ios::floatfield); 00896 o.precision(6); 00897 o << *x; 00898 } 00899 else if (const double* x = field.asDouble()) 00900 { 00901 o.setf(ios::fmtflags(0), ios::floatfield); 00902 o.precision(6); 00903 o << *x; 00904 } 00905 else if (const PDate* x = field.asDate()) 00906 { 00907 o.width(0); 00908 o << center(x->info(),field.char_width()); 00909 } 00910 else 00911 PLERROR("Unknown field type"); 00912 00913 return o; 00914 } 00915 00916 ostream& operator<<(ostream& o, const Row& rowc) 00917 { 00918 Row& row = const_cast<Row&>(rowc); 00919 Row::const_iterator it = row.begin(), end = row.end(); 00920 00921 while(it!=end) 00922 { 00923 o << it << " | "; 00924 ++it; 00925 } 00926 o << endl; 00927 return o; 00928 } 00929 00930 // [Having a SDB with rows 0,...,n-1]: 00931 // swap(1,n-1) 00932 // swap(3,n-3) 00933 // ... 
00934 // swap(n/2,n-n/2) 00935 // Not quite a random shuffle, but much more efficient 00936 // than randomShuffleRows (better use of cache) 00937 void halfShuffleRows(SDB& sdb) 00938 { 00939 Row rowi(&sdb.getSchema()); 00940 Row rowj(&sdb.getSchema()); 00941 int length = int(sdb.length()); 00942 for(int k=1; k<length/2; k+=2) 00943 { 00944 if(k%100000==1) 00945 cerr << k << endl; 00946 sdb.getInRow(k,rowi); 00947 sdb.getInRow(length-k,rowj); 00948 sdb.setRow(rowi,length-k); 00949 sdb.setRow(rowj,k); 00950 } 00951 } 00952 00953 // extremely slow for huge databases: cannot use cache in an efficient way 00954 void randomShuffleRows(SDB& sdb) 00955 { 00956 Row rowi(&sdb.getSchema()); 00957 Row rowj(&sdb.getSchema()); 00958 int length = int(sdb.length()); 00959 for(int i=0; i<sdb.length(); i++) 00960 { 00961 if(i%1000==0) 00962 cerr << i << endl; 00963 int j = i+int(uniform_sample()*(length-i)); 00964 sdb.getInRow(i,rowi); 00965 sdb.getInRow(j,rowj); 00966 sdb.setRow(rowi,j); 00967 sdb.setRow(rowj,i); 00968 } 00969 } 00970 00971 } // end of namespace PLearn