Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

SimpleDB.h

Go to the documentation of this file.
00001 // -*- C++ -*- 00002 00003 // SimpleDB.cc: Simple Database Representation (implementation) 00004 // 00005 // Copyright (C) 2000 Nicolas Chapados 00006 // 00007 // Redistribution and use in source and binary forms, with or without 00008 // modification, are permitted provided that the following conditions are met: 00009 // 00010 // 1. Redistributions of source code must retain the above copyright 00011 // notice, this list of conditions and the following disclaimer. 00012 // 00013 // 2. Redistributions in binary form must reproduce the above copyright 00014 // notice, this list of conditions and the following disclaimer in the 00015 // documentation and/or other materials provided with the distribution. 00016 // 00017 // 3. The name of the authors may not be used to endorse or promote 00018 // products derived from this software without specific prior written 00019 // permission. 00020 // 00021 // THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 00022 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00023 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 00024 // NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00025 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 00026 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00027 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00028 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00029 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00030 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00031 // 00032 // This file is part of the PLearn library. For more information on the PLearn 00033 // library, go to the PLearn Web site at www.plearn.org 00034 00037 #ifndef SIMPLEDB_H 00038 #define SIMPLEDB_H 00039 00040 00042 #include <limits.h> 00043 #include <string> 00044 #include <vector> 00045 #include <iostream> 00046 //#include <iomanip> 00047 #include <stdexcept> 00048 #include <typeinfo> 00049 //#include <algorithm> 00050 00051 // norman: added check for WIN32 00052 #ifndef WIN32 00053 #include <unistd.h> 00054 #else 00055 #include <assert.h> 00056 #include <io.h> 00057 #endif 00058 00059 #include <fcntl.h> 00060 #include <errno.h> 00061 #include <stdlib.h> 00062 //#include <ctype.h> 00063 00064 00066 //#include "general.h" 00067 #include <plearn/base/stringutils.h> 00068 //#include "random.h" 00069 #include <plearn/base/PDate.h> 00070 #include <plearn/math/Hash.h> 00071 #include <plearn/base/TinyVector.h> 00072 00073 #ifdef WIN32 00074 // norman: potentially dangerous if there is a function called with the same name in this 00075 // file. Beware! 00076 #define open _open 00077 #define close _close 00078 #define lseek _lseek 00079 #define read _read 00080 #define write _write 00081 #define unlink _unlink 00082 #endif 00083 00084 namespace PLearn { 00085 using namespace std; 00086 00088 class Row; 00089 00090 00091 //############################## CLASS FIELD ########################### 00099 // StringType length of string 00100 // (including null terminator) 00101 // CharacterType 1 == unsigned char 00102 // SignedCharType 1 == signed char 00103 // ShortType 2 == signed short 00104 // IntType 4 == signed int 00105 // FloatType 4 == float 00106 // DoubleType 8 == double 00107 // DateType 4 == PDate 00108 enum FieldType { 00109 Unknown = 0, 00110 StringType, 00111 CharacterType, 00112 SignedCharType, 00113 ShortType, 00114 IntType, 00115 FloatType, 00116 DoubleType, 00117 DateType 00118 }; 00119 00120 struct Field { 00121 Field() : name(), field_type(Unknown), precision() {} 00122 Field(string name, FieldType t, int p = 0) 00123 : name(name), field_type(t) 00124 { 00127 switch(field_type) { 00128 case Unknown: precision=0; break; 00129 case StringType: precision=p; break; 00130 case CharacterType: precision=1; break; 00131 case SignedCharType: precision=1; break; 00132 case ShortType: precision=2; break; 00133 case IntType: precision=4; break; 00134 case FloatType: precision=4; break; 00135 case DoubleType: precision=8; break; 00136 case DateType: precision=4; break; 00137 default: 00138 PLERROR("Unknown field type %d with name %s", 00139 int(field_type), name.c_str()); 00140 } 00141 if (sizeof(PDate) != 4) 00142 PLERROR("A PLearn PDate must have sizeof equal to 4"); 00143 } 00144 00145 bool operator==(const Field& x) const { 00146 return name==x.name && 00147 field_type == x.field_type && 00148 precision == x.precision; 00149 } 00150 00151 string name; 00152 FieldType field_type; 00153 int precision; 00154 }; 00155 00157 extern const char MissingString; 00158 extern const unsigned char MissingCharacter; 00159 extern const signed char MissingSignedChar; 00160 extern const short MissingShort; 00161 extern const int MissingInt; 00162 extern const float MissingFloat; 00163 extern const double MissingDouble; 00164 extern const PDate MissingDate; 00165 00166 00167 //############################# CLASS FIELDPTR ######################### 00180 class FieldPtr { 00181 friend class Row; 00182 friend class Schema; 00183 00184 public: 00186 FieldPtr() : field_index_(-1), offset_(-1) {} 00187 00189 00191 int field_index() const { 00192 return field_index_; 00193 } 00194 00195 ptrdiff_t offset() const { 00196 return offset_; 00197 } 00198 00200 operator bool() const { 00201 return field_index_ >= 0; 00202 } 00203 00205 bool operator!() const { 00206 return field_index_ == -1; 00207 } 00208 00210 bool operator==(const FieldPtr& x) const { 00211 return field_index_ == x.field_index_ && 00212 offset_ == x.offset_; 00213 } 00214 00215 bool operator!=(const FieldPtr& x) const { 00216 return !(*this == x); 00217 } 00218 00219 private: 00221 FieldPtr(int fi, ptrdiff_t o) : field_index_(fi), offset_(o) {} 00222 00223 private: 00224 int field_index_; 00225 ptrdiff_t offset_; 00226 }; 00227 00228 00229 //############################ CLASS FIELDVALUE ########################## 00236 class FieldValue 00237 { 00238 friend class FieldRowRef; 00239 00240 public: 00242 FieldValue(); 00243 FieldValue(const FieldValue& fv); 00244 ~FieldValue(); 00245 00247 explicit FieldValue(const char*); 00248 explicit FieldValue(unsigned char); 00249 explicit FieldValue(signed char); 00250 explicit FieldValue(short); 00251 explicit FieldValue(int); 00252 explicit FieldValue(float); 00253 explicit FieldValue(double); 00254 explicit FieldValue(const PDate&); 00255 00257 bool isMissing() const; 00258 void setMissing(); 00259 00261 string toString() const; 00262 double toDouble() const; 00263 PDate toDate() const; 00264 00265 operator double() const { return toDouble(); } 00266 operator float() const { return float(toDouble()); } 00267 operator int() const { return int(toDouble()); } 00268 operator string() const { return toString(); } 00269 operator PDate() const { return toDate(); } 00270 00272 00274 FieldValue& operator=(FieldValue); 00275 00277 bool operator==(const FieldValue&) const; 00278 bool operator< (const FieldValue&) const; 00279 00280 bool operator!=(const FieldValue& rhs) const { 00281 return !(*this == rhs); 00282 } 00283 00284 bool operator<=(const FieldValue& rhs) const { 00285 return (*this == rhs) || (*this < rhs); 00286 } 00287 00288 bool operator> (const FieldValue& rhs) const { 00289 return !(*this <= rhs); 00290 } 00291 00292 bool operator>=(const FieldValue& rhs) const { 00293 return !(*this < rhs); 00294 } 00295 00297 FieldValue operator+(const FieldValue&) const; 00298 FieldValue operator-(const FieldValue&) const; 00299 FieldValue operator*(const FieldValue&) const; 00300 FieldValue operator/(const FieldValue&) const; 00301 00303 void swap(FieldValue&); 00304 00306 bool isString() const { 00307 return field_type_ == StringType; 00308 } 00309 00310 bool isIntegral() const { 00311 return 00312 field_type_ == CharacterType || field_type_ == SignedCharType || 00313 field_type_ == ShortType || field_type_ == IntType; 00314 } 00315 00316 bool isFloating() const { 00317 return field_type_ == FloatType || field_type_ == DoubleType; 00318 } 00319 00320 bool isDate() const { 00321 return field_type_ == DateType; 00322 } 00323 00324 private: 00325 FieldType field_type_; 00326 int precision_; 00327 struct DateVal_t { 00328 short year; 00329 unsigned char month, day; 00330 }; 00331 union { 00332 char* string_val_; 00333 long long_val_; 00334 double double_val_; 00335 struct DateVal_t date_val_; 00336 }; 00337 }; 00338 00339 ostream& operator<<(ostream&, const FieldValue&); 00340 00341 00342 00343 //############################## CLASS SCHEMA ############################ 00357 class Schema : public vector<Field> 00358 { 00359 typedef vector<Field> inherited; 00360 typedef Field T; 00361 00362 public: 00364 Schema() : inherited() {} 00365 Schema(size_type n, const T& value) : inherited(n, value) {} 00366 Schema(int n, const T& value) : inherited(n, value) {} 00367 Schema(long n, const T& value) : inherited(n, value) {} 00368 explicit Schema(size_type n) : inherited(n) {} 00369 Schema(const Schema& x) : inherited(x) {} 00370 00371 #ifdef __STL_MEMBER_TEMPLATES 00372 template <class InputIterator> 00373 Schema(InputIterator first, InputIterator last) 00374 : inherited(first, last) {} 00375 #else 00376 Schema(const_iterator first, const_iterator last) 00377 : inherited(first, last) {} 00378 #endif 00380 00381 00382 00383 00388 bool findColumn(const string& name, int& position, int& start, 00389 int& precision) const; 00390 00392 FieldPtr findColumn(int position) const; 00393 00395 FieldPtr findColumn(const string& name) const; 00396 00401 FieldPtr operator()(int position) const { 00402 return findColumn(position); 00403 } 00404 00405 FieldPtr operator()(const string& name) const { 00406 return findColumn(name); 00407 } 00408 }; 00409 00410 00411 00412 //######################### CLASS SIMPLEDBINDEXKEY ####################### 00419 template <class KeyType> 00420 class SimpleDBIndexKey { 00421 public: 00432 typedef KeyType ByteArr; 00433 typedef typename ByteArr::iterator iterator; 00434 00435 SimpleDBIndexKey() {} 00436 00437 explicit SimpleDBIndexKey(size_t len) 00438 : raw(len, '\0') {} 00439 00440 SimpleDBIndexKey(const unsigned char* the_raw, size_t len) 00441 : raw(len, '\0') 00442 { 00443 copy(the_raw, the_raw+len, begin()); 00444 } 00445 00446 SimpleDBIndexKey(const ByteArr& the_raw) 00447 : raw(the_raw) {} 00448 00459 operator char*() const { 00460 return (char*)(&raw[0]); 00461 } 00462 00463 size_t byteLength() const { 00464 return raw.size(); 00465 } 00466 00468 bool operator==(const SimpleDBIndexKey& other) const { 00469 return raw == other.raw; 00470 } 00471 00472 bool operator!=(const SimpleDBIndexKey& other) const { 00473 return raw != other.raw; 00474 } 00475 00476 void resize(size_t len) { 00477 raw.resize(len); 00478 } 00479 00480 typename ByteArr::iterator begin() { 00481 return raw.begin(); 00482 } 00483 00484 typename ByteArr::iterator end(){ 00485 return raw.end(); 00486 } 00487 00488 private: 00489 ByteArr raw; 00490 }; 00491 00492 00493 00494 //############################# CLASS SIMPLEDB ######################### 00503 template <class KeyType = TinyVector<unsigned char, 8>, 00504 class QueryResult = TinyVector<unsigned int, 4> > 00505 class SimpleDB 00506 { 00509 00510 public: 00512 00515 typedef unsigned long RowNumber; 00516 enum { 00517 InvalidRow = ULONG_MAX 00518 }; 00519 00521 typedef unsigned long Offset; 00522 00524 enum AccessType { 00525 readwrite = 0, 00526 readonly = 1 00527 }; 00528 00531 static const Offset AbsoluteFileLimit = 512ul * 1024ul * 1024ul - 1; 00532 00534 typedef QueryResult QueryResult_t; 00535 static QueryResult EmptyResult; 00536 00538 typedef SimpleDBIndexKey<KeyType> IndexKey; 00539 typedef Hash<IndexKey,QueryResult> Index; 00540 typedef PP<Index> PIndex; 00541 00542 public: 00543 00545 SimpleDB(string rootname, string path=".", AccessType = readwrite, 00546 bool verbose=true); 00547 virtual ~SimpleDB(); 00548 00549 00551 string getName() const { 00552 return name; 00553 } 00554 string getPath() const { 00555 return path; 00556 } 00557 00558 00560 void setSchema(const Schema& s); 00561 00562 const Schema& getSchema() const { 00563 return schema; 00564 } 00565 00566 void saveSchema(); 00567 void loadSchema(); 00568 00573 bool findColumn(string name, int& position, int& start, 00574 int& precision) const { 00575 return schema.findColumn(name,position,start,precision); 00576 } 00577 00580 int indexOfField(const string& fieldname) const { 00581 return schema(fieldname).field_index(); 00582 } 00583 00585 Row getRow(RowNumber) const; 00586 Row& getInRow(RowNumber, Row&) const; 00587 RowNumber size() const { return size_; } 00588 00589 int length() const { 00590 return int(size()); 00591 } 00592 int width() const { 00593 return int(schema.size()); 00594 } 00595 00597 void addRow(const Row&); 00598 void setRow(const Row&, RowNumber); 00599 void truncateFromRow(RowNumber n); 00600 00601 00602 00604 00611 bool indexColumn(string columnName, 00612 string secondColumn = string("")); 00613 00615 void clearIndex(string columnName); 00616 00624 QueryResult findEqual(const unsigned char* lookfor, 00625 string columnName, 00626 string secondColumn = string("")); 00627 00629 const QueryResult& findEqualIndexed(const unsigned char* lookfor, 00630 string columnName, 00631 string secondColumn = string("")); 00632 00634 QueryResult findEqualLinear(const unsigned char* lookfor, 00635 string columnName, 00636 string secondColumn = string("")); 00637 00639 typedef vector<const unsigned char*> vuc; 00640 QueryResult findEqualLinear(const vuc& lookfor, 00641 string columnName, 00642 string secondColumn = string("")); 00643 00645 double tableSizeMultiplier() const { 00646 return table_size_multiplier; 00647 } 00648 void tableSizeMultiplier(double x) { 00649 table_size_multiplier = x; 00650 } 00651 00653 00654 private: 00656 void computeSize(); 00657 00660 void memoryToDisk(Row&) const; 00661 00664 void diskToMemory(Row&) const; 00665 00670 int seekToRow(RowNumber) const; 00671 00673 int seekToEnd() const; 00674 00676 void openAllFiles() const; 00677 00679 void closeAllFiles() const; 00680 00682 inline int lastSegment() const; 00683 00685 string getSegmentPath(int i) const; 00686 00687 private: 00689 string name; 00690 string path; 00691 AccessType access_type; 00692 int access_mask; 00693 RowNumber size_; 00694 00695 00697 Schema schema; 00698 int row_size; 00699 RowNumber max_records_file; 00700 00701 00702 mutable vector<int> allfd; 00703 00704 00706 00711 double table_size_multiplier; 00712 00715 vector<PIndex> indexes; 00716 00718 bool verbose; 00719 00720 private: 00722 SimpleDB(const SimpleDB&); 00723 void operator=(const SimpleDB&); 00724 }; 00725 00726 00727 //########################### CLASS ROWITERATOR ########################## 00738 // *myit1 = *myit2; 00739 00740 class FieldRowRef; 00741 00742 class RowIterator { 00743 public: 00745 RowIterator() : curfield(0), curptr(0), schema(0) { } 00746 RowIterator(const RowIterator& x) { 00747 curfield = x.curfield; 00748 curptr = x.curptr; 00749 schema = x.schema; 00750 } 00751 bool operator==(const RowIterator& x) { 00752 return 00753 curfield==x.curfield && 00754 curptr ==x.curptr && 00755 schema ==x.schema; 00756 } 00757 bool operator!=(const RowIterator& x) { 00758 return !((*this) == x); 00759 } 00760 00762 RowIterator(int curf, unsigned char* curp, const Schema* sc) 00763 : curfield(curf), curptr(curp), schema(sc) 00764 { } 00765 00767 00769 RowIterator& operator=(const RowIterator& x) 00770 { 00771 if (&x != this) 00772 { 00773 curfield = x.curfield; 00774 curptr = x.curptr; 00775 schema = x.schema; 00776 } 00777 return *this; 00778 } 00779 00781 void copyFrom(const RowIterator& it) 00782 { 00783 #ifdef BOUNDCHECK 00784 if(it.precision()!=precision() || it.getFieldType()!=getFieldType()) 00785 PLERROR("In Row::iterator::copyFrom Source and destination fields not of same type or precision"); 00786 #endif 00787 copy(it.raw(),it.raw()+it.precision(),raw()); 00788 } 00789 00791 inline FieldRowRef operator*() const; 00792 00794 00800 RowIterator& operator++() { 00801 if (schema && curptr && curfield < schema->size()) { 00802 curptr += (*schema)[curfield].precision; 00803 ++curfield; 00804 } 00805 return *this; 00806 } 00807 00809 RowIterator operator++(int) { 00810 RowIterator x = *this; 00811 ++(*this); 00812 return x; 00813 } 00814 00817 RowIterator operator[](int i) { 00818 assert(i >= 0); 00819 RowIterator it = *this; 00820 while (i--) 00821 ++it; 00822 return it; 00823 } 00824 00825 FieldType getFieldType() const 00826 { return (*schema)[curfield].field_type; } 00827 00830 00831 bool isString() const { 00832 return schema && curptr && curfield < schema->size() && 00833 (*schema)[curfield].field_type == StringType; 00834 } 00835 00836 bool isCharacter() const { 00837 return schema && curptr && curfield < schema->size() && 00838 (*schema)[curfield].field_type == CharacterType; 00839 } 00840 00841 bool isSignedChar() const { 00842 return schema && curptr && curfield < schema->size() && 00843 (*schema)[curfield].field_type == SignedCharType; 00844 } 00845 00846 bool isShort() const { 00847 return schema && curptr && curfield < schema->size() && 00848 (*schema)[curfield].field_type == ShortType; 00849 } 00850 00851 bool isInt() const { 00852 return schema && curptr && curfield < schema->size() && 00853 (*schema)[curfield].field_type == IntType; 00854 } 00855 00856 bool isFloat() const { 00857 return schema && curptr && curfield < schema->size() && 00858 (*schema)[curfield].field_type == FloatType; 00859 } 00860 00861 bool isDouble() const { 00862 return schema && curptr && curfield < schema->size() && 00863 (*schema)[curfield].field_type == DoubleType; 00864 } 00865 00866 bool isDate() const { 00867 return schema && curptr && curfield < schema->size() && 00868 (*schema)[curfield].field_type == DateType; 00869 } 00870 00871 00877 char* asString() { 00878 bool iss = isString(); 00879 if (iss) 00880 return reinterpret_cast<char*>(curptr); 00881 else 00882 return 0; 00883 //return isString()? reinterpret_cast<char*>(curptr) : 0; 00884 } 00885 00886 unsigned char* asCharacter() { 00887 return isCharacter()? 00888 reinterpret_cast<unsigned char*>(curptr) : 0; 00889 } 00890 00891 signed char* asSignedChar() { 00892 return isSignedChar()? 00893 reinterpret_cast<signed char*>(curptr) : 0; 00894 } 00895 00896 short* asShort() { 00897 return isShort()? reinterpret_cast<short*>(curptr) : 0; 00898 } 00899 00900 int* asInt() { 00901 return isInt()? reinterpret_cast<int*>(curptr) : 0; 00902 } 00903 00904 float* asFloat() { 00905 return isFloat()? reinterpret_cast<float*>(curptr) : 0; 00906 } 00907 00908 double* asDouble() { 00909 return isDouble()? reinterpret_cast<double*>(curptr) : 0; 00910 } 00911 00912 PDate* asDate() { 00913 return isDate()? reinterpret_cast<PDate*>(curptr) : 0; 00914 } 00915 00916 00923 const char* asString() const { 00924 return isString()? reinterpret_cast<const char*>(curptr) : 0; 00925 } 00926 00927 const unsigned char* asCharacter() const { 00928 return isCharacter()? 00929 reinterpret_cast<const unsigned char*>(curptr) : 0; 00930 } 00931 00932 const signed char* asSignedChar() const { 00933 return isSignedChar()? 00934 reinterpret_cast<const signed char*>(curptr) : 0; 00935 } 00936 00937 const short* asShort() const { 00938 return isShort()? reinterpret_cast<const short*>(curptr) : 0; 00939 } 00940 00941 const int* asInt() const { 00942 return isInt()? reinterpret_cast<const int*>(curptr) : 0; 00943 } 00944 00945 const float* asFloat() const { 00946 return isFloat()? reinterpret_cast<const float*>(curptr) : 0; 00947 } 00948 00949 const double* asDouble() const { 00950 return isDouble()? reinterpret_cast<const double*>(curptr) : 0; 00951 } 00952 00953 const PDate* asDate() const { 00954 return isDate()? reinterpret_cast<const PDate*>(curptr) : 0; 00955 } 00956 00957 00960 double toDouble() const; 00961 string toString() const; 00962 00963 00965 bool isMissing() const; 00966 void setMissing(); 00967 00968 00971 string name() const { 00972 return (schema && curfield < schema->size())? 00973 (*schema)[curfield].name : string(""); 00974 } 00975 00978 int precision() const { 00979 return (schema && curfield < schema->size())? 00980 (*schema)[curfield].precision : -1; 00981 } 00982 00985 int char_width() const; 00986 00988 unsigned char* raw() { 00989 return curptr; 00990 } 00991 00992 const unsigned char* raw() const { 00993 return curptr; 00994 } 00995 01000 FieldPtr makePtr() const { 01001 return schema->findColumn(curfield); 01002 } 01003 01004 private: 01005 unsigned curfield; 01006 unsigned char* curptr; 01007 const Schema* schema; 01008 }; 01009 01010 01011 //########################### CLASS FIELDROWREF ########################## 01018 class FieldRowRef 01019 { 01020 public: 01021 FieldRowRef(const RowIterator& it) 01022 : it_(it) {} 01023 01025 operator FieldValue() const; 01026 FieldRowRef& operator=(const FieldValue&); 01027 FieldRowRef& operator=(const FieldRowRef rhs) { 01028 return operator=(FieldValue(rhs)); 01029 } 01030 01032 inline RowIterator operator&() const; 01033 01034 private: 01035 RowIterator it_; 01036 01037 }; 01038 01039 01040 //############################### CLASS ROW ############################ 01047 class Row { 01048 01049 public: 01051 typedef RowIterator iterator; 01052 typedef iterator const_iterator; 01053 typedef long size_type; 01054 01055 public: 01060 Row() : rawrow(), schema(0) { } 01061 Row(const Row& r) : rawrow(r.rawrow), schema(r.schema) { } 01062 01064 Row(const Schema* s); 01065 01068 Row(const vector<unsigned char>& raw, const Schema* s) 01069 : rawrow(raw), schema(s) { } 01070 01071 01073 01074 01076 iterator begin() { 01077 return iterator(0, raw(), schema); 01078 } 01079 01080 iterator end() { 01081 if (schema) 01082 return iterator(schema->size(), raw()+rawrow.size(), 01083 schema); 01084 else 01085 return iterator(0,0,0); 01086 } 01087 01089 size_type size() const { 01090 return (size_type)rawrow.size(); 01091 } 01092 01093 size_type max_size() const { 01094 return (size_type)rawrow.size(); 01095 } 01096 01097 bool empty() const { 01098 return (schema && schema->empty()) || !schema; 01099 } 01100 01102 const unsigned char* raw() const { 01103 if (rawrow.size()) 01104 return &rawrow[0]; 01105 else 01106 return 0; 01107 } 01108 01109 unsigned char* raw() { 01110 if (rawrow.size()) 01111 return &rawrow[0]; 01112 else 01113 return 0; 01114 } 01115 01116 const Schema* getSchema() const { 01117 return schema; 01118 } 01119 01127 iterator operator[](int fieldNumber); 01128 iterator operator[](string fieldName); 01129 01135 iterator bind(const FieldPtr& p) const { 01136 if (!p) 01137 PLERROR("Trying to dereference a null FieldPtr"); 01138 return iterator(p.field_index_, 01139 const_cast<unsigned char*>(raw()) + p.offset_, 01140 schema); 01141 } 01142 01147 void sanitize() const; 01148 01149 private: 01150 vector<unsigned char> rawrow; 01151 const Schema* schema; 01152 }; 01153 01154 01155 //##### Row-related global functions #################################### 01156 01158 double todouble(const Row::iterator& it); 01159 string tostring(const Row::iterator& it); 01160 01163 ostream& operator<<(ostream& o, const Row::iterator& field); 01164 01166 ostream& operator<<(ostream&, const Row& row); 01167 01169 void printFieldName(ostream& o, const Row::iterator& field); 01170 01172 void printFieldNames(ostream& o, const Row& row); 01173 01174 01175 //##### Miscellaneous Declarations ######################################## 01176 01178 typedef SimpleDB<> SDB; 01179 01182 void randomShuffleRows(SDB& sdb); 01183 01186 void halfShuffleRows(SDB& sdb); 01187 01188 01189 01190 //##### Non-Template Inline Functions ##################################### 01191 01192 FieldRowRef RowIterator::operator*() const 01193 { 01194 return FieldRowRef(*this); 01195 } 01196 01197 RowIterator FieldRowRef::operator&() const 01198 { 01199 return it_; 01200 } 01201 01202 01203 01204 //##### Implementation of Templates ####################################### 01205 01206 template <class KT, class QR> 01207 QR SimpleDB<KT,QR>::EmptyResult; 01208 01209 01210 01211 //##### SimpleDB-related functions ######################################## 01212 01213 template <class KT, class QR> 01214 SimpleDB<KT,QR>::SimpleDB(string rootname, string the_path, 01215 AccessType the_access_type, bool the_verbose) 01216 : name(rootname), path(the_path), access_type(the_access_type), 01217 access_mask(0), schema(), row_size(), allfd(), 01218 table_size_multiplier(1.8), indexes(), verbose(the_verbose) 01219 { 01220 if (path != "") 01221 path += slash; 01222 string fullpath = path + name + ".sdb"; 01223 01224 switch (access_type) { 01225 case readwrite: 01226 access_mask = O_RDWR; 01227 break; 01228 case readonly: 01229 access_mask = O_RDONLY; 01230 break; 01231 } 01232 01233 loadSchema(); 01234 openAllFiles(); 01235 computeSize(); 01236 } 01237 01238 template <class KT, class QR> 01239 SimpleDB<KT,QR>::~SimpleDB() 01240 { 01242 closeAllFiles(); 01243 saveSchema(); 01244 } 01245 01246 template <class KT, class QR> 01247 void SimpleDB<KT,QR>::setSchema(const Schema& s) 01248 { 01249 schema = s; 01250 Row row(&s); 01251 row_size = row.size(); 01252 indexes.resize(s.size()); 01253 01256 if (row_size > 0) 01257 max_records_file = RowNumber(AbsoluteFileLimit / row_size); 01258 else 01259 max_records_file = 0; 01260 01262 01264 closeAllFiles(); 01265 openAllFiles(); 01266 computeSize(); 01267 } 01268 01269 template <class KT, class QR> 01270 void SimpleDB<KT,QR>::saveSchema() 01271 { 01272 if (access_type==readwrite) { 01273 string fullpath = path + name + ".ssc"; 01274 ofstream sf(fullpath.c_str(), ios::out); 01275 Schema::iterator it = schema.begin(), end = schema.end(); 01276 for (; it != end; ++it) { 01277 sf << it->name << " "; 01278 switch (it->field_type) { 01279 case Unknown: 01280 break; 01281 case StringType: 01282 sf << "string " << it->precision << endl; 01283 break; 01284 01285 case CharacterType: 01286 sf << "character" << endl; 01287 break; 01288 01289 case SignedCharType: 01290 sf << "signedchar" << endl; 01291 break; 01292 01293 case ShortType: 01294 sf << "short" << endl; 01295 break; 01296 01297 case IntType: 01298 sf << "int" << endl; 01299 break; 01300 01301 case FloatType: 01302 sf << "float" << endl; 01303 break; 01304 01305 case DoubleType: 01306 sf << "double" << endl; 01307 break; 01308 01309 case DateType: 01310 sf << "date" << endl; 01311 break; 01312 01313 default: 01314 PLERROR("Unknown field type in database: %d", it->field_type); 01315 } 01316 } 01317 } 01318 } 01319 01320 template <class KT, class QR> 01321 void SimpleDB<KT,QR>::loadSchema() 01322 { 01325 string fullpath = path + name + ".ssc"; 01326 ifstream sf(fullpath.c_str()); 01327 Schema schema; 01328 while (sf) { 01329 string name,type; 01330 sf >> name >> type; 01331 if (name.size() == 0 || type.size() == 0) 01332 break; 01333 type = lowerstring(type); 01334 if (type == "string") { 01335 int length; 01336 sf >> length; 01337 schema.push_back(Field(name,StringType,length)); 01338 } 01339 else if (type == "character") 01340 schema.push_back(Field(name,CharacterType)); 01341 else if (type == "signedchar") 01342 schema.push_back(Field(name,SignedCharType)); 01343 else if (type == "short") 01344 schema.push_back(Field(name,ShortType)); 01345 else if (type == "int") 01346 schema.push_back(Field(name,IntType)); 01347 else if (type == "float") 01348 schema.push_back(Field(name,FloatType)); 01349 else if (type == "double") 01350 schema.push_back(Field(name,DoubleType)); 01351 else if (type == "date") 01352 schema.push_back(Field(name,DateType)); 01353 else { 01354 cerr << "Unexpected input type \"" << type 01355 << "\" in schema file " << fullpath << endl; 01356 exit(1); 01357 } 01358 } 01359 setSchema(schema); 01360 } 01361 01362 template <class KT, class QR> 01363 void SimpleDB<KT,QR>::addRow(const Row& row) 01364 { 01365 if(row_size != row.size()) 01366 PLERROR("In addRow row_size != row.size() (%d != %d)", row_size, row.size()); 01367 row.sanitize(); 01368 int fd = seekToEnd(); 01369 off_t curpos = lseek(fd, 0L, SEEK_CUR); 01370 01371 #ifdef LITTLEENDIAN 01372 01373 //ssize_t writtensize = ::write(fd, row.raw(), row_size); 01374 int writtensize = ::write(fd, row.raw(), row_size); 01375 #endif 01376 #ifdef BIGENDIAN 01377 Row newrow(row); 01378 memoryToDisk(newrow); 01379 int writtensize = ::write(fd, newrow.raw(), row_size); 01380 #endif 01381 01383 if (writtensize == -1) { 01386 #if defined(_MINGW_) || defined(WIN32) 01387 PLWARNING("could not truncate database file, end may be corrupted!"); 01388 #else 01389 ftruncate(fd, curpos); 01390 #endif 01391 PLERROR("Could not write to database: %s", strerror(errno)); 01392 } 01393 else 01394 size_++; 01395 } 01396 01397 template <class KT, class QR> 01398 void SimpleDB<KT,QR>::setRow(const Row& row, RowNumber n) 01399 { 01400 if(n<0 || n>=size()) 01401 PLERROR("Out of bounds in SimpleDB::setRow"); 01402 if(row_size != row.size()) 01403 PLERROR("In setRow row_size != row.size() (%d != %d)", row_size, row.size()); 01404 row.sanitize(); 01405 int fd = seekToRow(n); 01406 01407 #ifdef LITTLEENDIAN 01408 int writtensize = ::write(fd, row.raw(), row_size); 01409 #endif 01410 #ifdef BIGENDIAN 01411 Row newrow(row); 01412 memoryToDisk(newrow); 01413 int writtensize = ::write(fd, newrow.raw(), row_size); 01414 #endif 01415 01417 if (writtensize == -1) 01418 PLERROR("Could not write to database: %s",strerror(errno)); 01419 } 01420 01421 01422 template <class KT, class QR> 01423 void SimpleDB<KT,QR>::truncateFromRow(RowNumber n) 01424 { 01430 int curfd = seekToRow(n); 01431 off_t curpos = lseek(curfd, 0L, SEEK_CUR); 01432 if (ftruncate(curfd, curpos) == -1) { 01433 PLERROR((string("Could not truncate database at row ") + 01434 tostring(n) + ": " + strerror(errno)).c_str()); 01435 } 01436 01437 vector<int>::iterator found = find(allfd.begin(), allfd.end(), curfd); 01438 int fromfd = found-allfd.begin() + 1; 01439 int last = lastSegment(); 01440 01441 closeAllFiles(); 01442 bool allok = true; 01443 01444 for ( ; fromfd <= last; ++fromfd) { 01445 string path = getSegmentPath(fromfd); 01446 if(unlink(path.c_str()) == -1) { 01447 PLWARNING((string("Could not unlink database segment ") + path + 01448 ": " + strerror(errno)).c_str()); 01449 allok = false; 01450 } 01451 } 01452 01453 if (allok) { 01454 openAllFiles(); 01455 computeSize(); 01456 } 01457 else 01458 PLERROR("Error during truncation"); 01459 } 01460 01461 01462 template <class KT, class QR> 01463 Row& SimpleDB<KT,QR>::getInRow(RowNumber n, Row& row) const 01464 { 01465 if(n<0 || n>=size()) 01466 PLERROR("Out of Bounds in SimpleDB::getInRow"); 01467 if(row_size != row.size()) 01468 PLERROR("In getInRow row_size!=row_size()"); 01469 int fd = seekToRow(n); 01470 01471 int size_read = ::read(fd, row.raw(), row_size); 01472 if (size_read == -1) 01473 PLERROR("Could not read from database: %s",strerror(errno)); 01474 diskToMemory(row); 01475 return row; 01476 } 01477 01478 template <class KT, class QR> 01479 Row SimpleDB<KT,QR>::getRow(RowNumber n) const 01480 { 01481 Row row(&schema); 01482 getInRow(n, row); 01483 return row; 01484 } 01485 01486 template <class KT, class QR> 01487 void SimpleDB<KT,QR>::computeSize() 01488 { 01489 if(row_size<=0) 01490 size_ = 0; 01491 else 01492 { 01493 size_ = 0; 01494 int i=0; 01495 int bytesinfile = file_size(getSegmentPath(i++)); 01496 while(bytesinfile>0) 01497 { 01498 size_ += bytesinfile/row_size; 01499 bytesinfile = file_size(getSegmentPath(i++)); 01500 } 01501 } 01502 01505 01518 } 01519 01520 template <class KT, class QR> 01521 void SimpleDB<KT,QR>::memoryToDisk(Row& row) const 01522 { 01523 #ifdef LITTLEENDIAN 01524 01525 #endif 01526 #ifdef BIGENDIAN 01527 Row newr(row); 01528 Row::iterator it = newr.begin(), end = newr.end(); 01529 for(; it != end; ++it) { 01531 if (short* x = it.asShort()) 01532 reverse_short(x,1); 01533 if (int* x = it.asInt()) 01534 reverse_int(x,1); 01535 if (float* x = it.asFloat()) 01536 reverse_float(x,1); 01537 if (double* x = it.asDouble()) 01538 reverse_double(x,1); 01539 if (PDate* x = it.asDate()) { 01540 reverse_short(&(x->year),1); 01541 } 01542 } 01543 #endif 01544 } 01545 01546 template <class KT, class QR> 01547 void SimpleDB<KT,QR>::diskToMemory(Row& row) const 01548 { 01549 memoryToDisk(row); 01550 } 01551 01552 01553 //##### Physical Splitting Among Multiple Files ######################### 01554 01555 template <class KT, class QR> 01556 int SimpleDB<KT,QR>::seekToRow(RowNumber i) const 01557 { 01559 01560 if (max_records_file == 0) 01561 PLERROR("Attempting to seekToRow without schema set"); 01562 01563 int segmentNumber = int(i / max_records_file); 01564 Offset rowInSegment = Offset(i % max_records_file); 01565 01567 if (segmentNumber > lastSegment()) { 01568 for (int i = lastSegment()+1; i <= segmentNumber; ++i) { 01569 int fd = open(getSegmentPath(i).c_str(), 01570 access_mask | O_CREAT, 0777); 01571 if (fd == -1) 01572 PLERROR("Could not open database segment %d at path %s: %s", 01573 i, getSegmentPath(i).c_str(), strerror(errno)); 01574 allfd.push_back(fd); 01575 } 01576 } 01577 if (allfd[segmentNumber] == -1) 01578 PLERROR("Problem accessing database segment %d at path %s", 01579 segmentNumber, getSegmentPath(segmentNumber).c_str()); 01580 01581 if (lseek(allfd[segmentNumber], 01582 rowInSegment * Offset(row_size), SEEK_SET)<0) 01583 PLERROR("problem in lseek: %s",strerror(errno)); 01584 return allfd[segmentNumber]; 01585 } 01586 01587 01588 template <class KT, class QR> 01589 int SimpleDB<KT,QR>::seekToEnd() const 01590 { 01592 01593 if (max_records_file == 0) 01594 PLERROR("Attempting to seekToEnd without schema set"); 01595 01596 int last = lastSegment(); 01597 int fd = allfd[last]; 01598 if (fd == -1) 01599 PLERROR("Problem accessing database segment %d at path %s", 01600 last, getSegmentPath(last).c_str()); 01601 01602 off_t pos = lseek(fd, 0ul, SEEK_END); 01603 01606 if (Offset(pos) / Offset(row_size) >= max_records_file) 01607 fd = seekToRow((last+1)*max_records_file); 01608 01609 return fd; 01610 } 01611 01612 01613 template <class KT, class QR> 01614 void SimpleDB<KT,QR>::openAllFiles() const 01615 { 01621 closeAllFiles(); 01622 01623 int fd; 01624 fd = open(getSegmentPath(0).c_str(), 01625 access_mask | O_CREAT, 0777); 01626 if (fd == -1) 01627 PLERROR("Could not open main database segment %s: %s", 01628 getSegmentPath(0).c_str(), strerror(errno)); 01629 allfd.push_back(fd); 01630 01631 int index = 1; 01632 for ( ; ; ++index ) { 01633 fd = open(getSegmentPath(index).c_str(), 01634 access_mask); 01635 if (fd == -1) 01636 break; 01637 else 01638 allfd.push_back(fd); 01639 } 01640 } 01641 01642 01643 template <class KT, class QR> 01644 void SimpleDB<KT,QR>::closeAllFiles() const 01645 { 01646 vector<int>::iterator it=allfd.begin(), end=allfd.end(); 01647 for (; it != end; ++it) 01648 if (*it != -1) { 01649 close(*it); 01650 } 01651 allfd.clear(); 01652 } 01653 01654 01655 template <class KT, class QR> 01656 inline int SimpleDB<KT,QR>::lastSegment() const 01657 { 01658 return (int)allfd.size() - 1; 01659 } 01660 01661 01662 template <class KT, class QR> 01663 string SimpleDB<KT,QR>::getSegmentPath(int i) const 01664 { 01665 string fullpath = path + name; 01666 if (i >= 1 && i <= 26) { 01667 string postfix(1, char('a'+i-1)); 01668 fullpath += string("_") + postfix; 01669 } 01670 else if (i > 26) 01671 PLERROR("Too many segments in the database."); 01672 if(fullpath.find(".sdb")==string::npos) 01673 fullpath += ".sdb"; 01674 return fullpath; 01675 } 01676 01677 01678 //##### Indexing- and Query-Related Functions ########################### 01679 01680 template <class KT, class QR> 01681 bool SimpleDB<KT,QR>::indexColumn(string column_name, 01682 string second_column) 01683 { 01684 bool has_second_column = (second_column.size() > 0); 01685 int n, start_pos, column_precision =0, 01686 n2, start_pos2, column_precision2=0; 01687 if (!findColumn(column_name, n, start_pos, column_precision)) 01688 return false; 01689 if (has_second_column && 01690 !findColumn(second_column, n2, start_pos2, column_precision2)) 01691 return false; 01692 01697 RowNumber maxrows = size(); 01698 RowNumber tablesize = RowNumber(table_size_multiplier*maxrows); 01699 if (maxrows <= 0 || tablesize <= 0) { 01700 PLWARNING("SimpleDB::indexColumn: cannot index a database of " 01701 "zero size."); 01702 return false; 01703 } 01704 if (!indexes[n]) { 01705 indexes[n] = new Index(tablesize, true); 01706 indexes[n]->initializeTable((unsigned int)tablesize); 01707 } 01708 Index& index = *indexes[n]; 01709 index.resize(tablesize); 01710 index.flush(); 01711 01712 Row currow(&schema); 01713 IndexKey key(column_precision + column_precision2); 01714 typename IndexKey::iterator keybegin = key.begin(); 01715 unsigned char* begin1 = currow.raw() + start_pos; 01716 unsigned char* end1 = begin1 + column_precision; 01717 unsigned char* begin2 = currow.raw() + start_pos2; 01718 unsigned char* end2 = begin2 + column_precision2; 01719 01720 for(RowNumber i=0; i<maxrows; ++i) { 01721 if (verbose && i % 1000000 == 0) { 01722 unsigned numclusters, maxcluster; 01723 index.diagnostics(numclusters, maxcluster); 01724 cerr << "indexing row " << i 01725 << "\t num. clusters=" << numclusters 01726 << "\t max. cluster size=" << maxcluster 01727 << endl; 01728 } 01729 01730 getInRow(i,currow); 01731 01737 copy(begin1, end1, keybegin); 01738 if (has_second_column) 01739 copy(begin2, end2, keybegin+column_precision-1); 01740 unsigned int addr = index.hashAddress(key); 01741 01742 if (addr == Hash_UNUSED_TAG) { 01748 bool needresize = !index.add(key,QueryResult_t()); 01749 if (needresize) { 01750 cerr << "Hash table unexpectedly full; exiting..." << endl; 01751 exit(1); 01752 } 01753 addr = index.hashAddress(key); 01754 } 01755 01757 QueryResult_t* qr = index[addr]; 01758 try { 01759 qr->push_back(i); 01760 } 01761 catch (logic_error& e) { 01762 cerr << "Exception caught during indexing: " 01763 << typeid(e).name() << endl 01764 << "Containing: " << e.what() << endl; 01765 throw; 01766 } 01767 } 01768 return true; 01769 } 01770 01771 template <class KT, class QR> 01772 void SimpleDB<KT,QR>::clearIndex(string column_name) 01773 { 01774 int n, start_pos, column_precision=0; 01775 if (findColumn(column_name, n, start_pos, column_precision)) 01776 indexes[n] = 0; 01777 } 01778 01779 template <class KT, class QR> 01780 QR SimpleDB<KT,QR>::findEqual(const unsigned char* lookfor, 01781 string column_name, string second_column) 01782 { 01783 int n, start_pos, column_precision; 01784 if (!findColumn(column_name, n, start_pos, column_precision)) 01785 return EmptyResult; 01786 if (indexes[n]) 01787 return findEqualIndexed(lookfor, column_name, second_column); 01788 else 01789 return findEqualLinear(lookfor, column_name, second_column); 01790 } 01791 01792 01793 template <class KT, class QR> 01794 const QR& SimpleDB<KT,QR>::findEqualIndexed(const unsigned char* lookfor, 01795 string column_name, 01796 string second_column) 01797 { 01798 bool has_second_column = (second_column.size() > 0); 01799 int n, start_pos, column_precision =0, 01800 n2, start_pos2, column_precision2=0; 01801 if (!findColumn(column_name, n, start_pos, column_precision)) 01802 return EmptyResult; 01803 if (has_second_column && 01804 !findColumn(second_column, n2, start_pos2, column_precision2)) 01805 return EmptyResult; 01806 01807 if (!indexes[n]) 01808 PLERROR("SimpleDB::indexColumn must be done before performing " 01809 "indexed searches on column %s", column_name.c_str()); 01810 01811 Index& index = *indexes[n]; 01812 IndexKey key(lookfor, column_precision+column_precision2); 01813 unsigned int addr = index.hashAddress(key); 01814 if (addr == Hash_UNUSED_TAG) 01815 return EmptyResult; 01816 else 01817 return *index[addr]; 01818 } 01819 01820 01821 template <class KT, class QR> 01822 QR SimpleDB<KT,QR>::findEqualLinear(const unsigned char* lookfor, 01823 string column_name, 01824 string second_column) 01825 { 01826 vuc lf(1); 01827 lf[0] = lookfor; 01828 return findEqualLinear(lf, column_name, second_column); 01829 } 01830 01831 01832 template <class KT, class QR> 01833 QR SimpleDB<KT,QR>::findEqualLinear( 01834 const vuc& lookfor, 01835 string column_name, string second_column) 01836 { 01837 bool has_second_column = (second_column.size() > 0); 01838 int n, start_pos, column_precision =0, 01839 n2, start_pos2, column_precision2=0; 01840 if (!findColumn(column_name, n, start_pos, column_precision)) 01841 return EmptyResult; 01842 if (has_second_column && 01843 !findColumn(second_column, n2, start_pos2, column_precision2)) 01844 return EmptyResult; 01845 01846 QR qr; 01847 01849 vector<IndexKey> key_lookfor(lookfor.size()); 01850 vuc::const_iterator 01851 lookit, lookbeg = lookfor.begin(), lookend = lookfor.end(); 01852 typename vector<IndexKey>::iterator 01853 keyit, keybeg = key_lookfor.begin(), keyend = key_lookfor.end(); 01854 size_t len = column_precision+column_precision2; 01855 01856 for (lookit=lookbeg, keyit=keybeg ; lookit != lookend ; 01857 ++lookit, ++keyit) { 01858 keyit->resize(len); 01859 copy(*lookit, *lookit+len, keyit->begin()); 01860 } 01861 01862 IndexKey key_dbrow(column_precision+column_precision2); 01863 typename IndexKey::iterator keybegin = key_dbrow.begin(); 01864 01865 RowNumber maxrows = size(); 01866 Row currow(&schema); 01867 unsigned char* begin1 = currow.raw() + start_pos; 01868 unsigned char* end1 = begin1 + column_precision; 01869 unsigned char* begin2 = currow.raw() + start_pos2; 01870 unsigned char* end2 = begin2 + column_precision2; 01871 01873 for (RowNumber i=0; i<maxrows; ++i) { 01874 if (verbose && i % 1000000 == 0) { 01875 cerr << "Searching row " << i << endl; 01876 } 01877 01878 getInRow(i, currow); 01879 01880 copy(begin1, end1, keybegin); 01881 if (has_second_column) 01884 copy(begin2, end2, keybegin+column_precision-1); 01885 01887 for (keyit = keybeg ; keyit != keyend ; ++keyit) 01888 if (*keyit == key_dbrow) { 01889 qr.push_back(i); 01890 if (verbose) 01891 cerr << "Found string \"" << *keyit 01892 << "\" at row " << i << endl; 01893 } 01894 } 01895 01896 return qr; 01897 } 01898 01899 #ifdef WIN32 01900 #undef open 01901 #undef close 01902 #undef lseek 01903 #undef read 01904 #undef write 01905 #undef unlink 01906 #endif 01907 01908 } // end of namespace PLearn 01909 01910 #endif 01911

Generated on Tue Aug 17 16:05:49 2004 for PLearn by doxygen 1.3.7