00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00037
#ifndef SIMPLEDB_H
00038
#define SIMPLEDB_H
00039
00040
00042
#include <limits.h>
00043
#include <string>
00044
#include <vector>
00045
#include <iostream>
00046
00047
#include <stdexcept>
00048
#include <typeinfo>
00049
00050
00051
00052
#ifndef WIN32
00053
#include <unistd.h>
00054
#else
00055
#include <assert.h>
00056
#include <io.h>
00057
#endif
00058
00059
#include <fcntl.h>
00060
#include <errno.h>
00061
#include <stdlib.h>
00062
00063
00064
00066
00067
#include <plearn/base/stringutils.h>
00068
00069
#include <plearn/base/PDate.h>
00070
#include <plearn/math/Hash.h>
00071
#include <plearn/base/TinyVector.h>
00072
00073
#ifdef WIN32
00074
00075
00076
#define open _open
00077
#define close _close
00078
#define lseek _lseek
00079
#define read _read
00080
#define write _write
00081
#define unlink _unlink
00082
#endif
00083
00084
namespace PLearn {
00085
using namespace std;
00086
00088
class Row;
00089
00090
00091
00099
00100
00101
00102
00103
00104
00105
00106
00107
/// Tag identifying the storage type of one schema column.
/// Enumerators are numbered consecutively from 0 (Unknown) to 8 (DateType).
enum FieldType {
    Unknown        = 0,
    StringType     = 1,
    CharacterType  = 2,
    SignedCharType = 3,
    ShortType      = 4,
    IntType        = 5,
    FloatType      = 6,
    DoubleType     = 7,
    DateType       = 8
};
00119
00120 struct Field {
00121 Field() :
name(),
field_type(
Unknown),
precision() {}
00122 Field(
string name, FieldType t,
int p = 0)
00123 : name(name),
field_type(t)
00124 {
00127
switch(
field_type) {
00128
case Unknown:
precision=0;
break;
00129
case StringType:
precision=p;
break;
00130
case CharacterType:
precision=1;
break;
00131
case SignedCharType:
precision=1;
break;
00132
case ShortType:
precision=2;
break;
00133
case IntType:
precision=4;
break;
00134
case FloatType:
precision=4;
break;
00135
case DoubleType:
precision=8;
break;
00136
case DateType:
precision=4;
break;
00137
default:
00138
PLERROR(
"Unknown field type %d with name %s",
00139
int(
field_type), name.c_str());
00140 }
00141
if (
sizeof(
PDate) != 4)
00142
PLERROR(
"A PLearn PDate must have sizeof equal to 4");
00143 }
00144
00145 bool operator==(
const Field& x)
const {
00146
return name==
x.name &&
00147
field_type ==
x.field_type &&
00148
precision ==
x.precision;
00149 }
00150
00151 string name;
00152 FieldType field_type;
00153 int precision;
00154 };
00155
00157
extern const char MissingString;
00158
extern const unsigned char MissingCharacter;
00159
extern const signed char MissingSignedChar;
00160
extern const short MissingShort;
00161
extern const int MissingInt;
00162
extern const float MissingFloat;
00163
extern const double MissingDouble;
00164
extern const PDate MissingDate;
00165
00166
00167
/// Lightweight handle designating a field by its index in a schema and its
/// byte offset.  A default-constructed FieldPtr is "null" (index and offset
/// both -1).  Non-null instances can only be created by the friend classes
/// Row and Schema.
class FieldPtr {
    friend class Row;
    friend class Schema;

public:
    /// Build a null pointer.
    FieldPtr() : field_index_(-1), offset_(-1) {}

    /// Index of the designated field (negative when null).
    int field_index() const {
        return field_index_;
    }

    /// Byte offset of the designated field.
    ptrdiff_t offset() const {
        return offset_;
    }

    /// True when the pointer designates a field (non-negative index).
    operator bool() const {
        return field_index_ >= 0;
    }

    /// Exact negation of operator bool().  It used to test only for -1,
    /// which made `!p` and `!bool(p)` disagree for other negative indices.
    bool operator!() const {
        return field_index_ < 0;
    }

    /// Equal when both the index and the offset match.
    bool operator==(const FieldPtr& x) const {
        return field_index_ == x.field_index_ &&
               offset_ == x.offset_;
    }

    bool operator!=(const FieldPtr& x) const {
        return !(*this == x);
    }

private:
    /// Direct construction is reserved to the friend classes.
    FieldPtr(int fi, ptrdiff_t o) : field_index_(fi), offset_(o) {}

private:
    int field_index_;   ///< field index, negative when null
    ptrdiff_t offset_;  ///< byte offset of the field
};
00227
00228
00229
00236 class FieldValue
00237 {
00238
friend class FieldRowRef;
00239
00240
public:
00242
FieldValue();
00243
FieldValue(
const FieldValue& fv);
00244
~FieldValue();
00245
00247
explicit FieldValue(
const char*);
00248
explicit FieldValue(
unsigned char);
00249
explicit FieldValue(
signed char);
00250
explicit FieldValue(
short);
00251
explicit FieldValue(
int);
00252
explicit FieldValue(
float);
00253
explicit FieldValue(
double);
00254
explicit FieldValue(
const PDate&);
00255
00257
bool isMissing()
const;
00258
void setMissing();
00259
00261
string toString()
const;
00262
double toDouble()
const;
00263
PDate toDate()
const;
00264
00265 operator double()
const {
return toDouble(); }
00266 operator float()
const {
return float(
toDouble()); }
00267 operator int()
const {
return int(
toDouble()); }
00268 operator string()
const {
return toString(); }
00269 operator PDate()
const {
return toDate(); }
00270
00272
00274
FieldValue& operator=(
FieldValue);
00275
00277
bool operator==(
const FieldValue&) const;
00278
bool operator< (const
FieldValue&) const;
00279
00280 bool operator!=(const
FieldValue& rhs)
const {
00281
return !(*
this == rhs);
00282 }
00283
00284 bool operator<=(
const FieldValue& rhs)
const {
00285
return (*
this == rhs) || (*
this < rhs);
00286 }
00287
00288 bool operator> (
const FieldValue& rhs)
const {
00289
return !(*
this <= rhs);
00290 }
00291
00292 bool operator>=(
const FieldValue& rhs)
const {
00293
return !(*
this < rhs);
00294 }
00295
00297
FieldValue operator+(
const FieldValue&) const;
00298
FieldValue operator-(const
FieldValue&) const;
00299
FieldValue operator*(const
FieldValue&) const;
00300
FieldValue operator/(const
FieldValue&) const;
00301
00303
void swap(
FieldValue&);
00304
00306 bool isString()
const {
00307
return field_type_ ==
StringType;
00308 }
00309
00310 bool isIntegral()
const {
00311
return
00312
field_type_ ==
CharacterType ||
field_type_ ==
SignedCharType ||
00313
field_type_ ==
ShortType ||
field_type_ ==
IntType;
00314 }
00315
00316 bool isFloating()
const {
00317
return field_type_ ==
FloatType ||
field_type_ ==
DoubleType;
00318 }
00319
00320 bool isDate()
const {
00321
return field_type_ ==
DateType;
00322 }
00323
00324
private:
00325 FieldType field_type_;
00326 int precision_;
00327 struct DateVal_t {
00328 short year;
00329 unsigned char month,
day;
00330 };
00331
union {
00332 char*
string_val_;
00333 long long_val_;
00334 double double_val_;
00335 struct DateVal_t date_val_;
00336 };
00337 };
00338
00339 ostream&
operator<<(ostream&,
const FieldValue&);
00340
00341
00342
00343
00357 class Schema :
public vector<Field>
00358 {
00359 typedef vector<Field> inherited;
00360 typedef Field T;
00361
00362
public:
00364 Schema() :
inherited() {}
00365 Schema(size_type n,
const T& value) :
inherited(n, value) {}
00366 Schema(
int n,
const T& value) :
inherited(n, value) {}
00367 Schema(
long n,
const T& value) :
inherited(n, value) {}
00368 explicit Schema(size_type n) :
inherited(n) {}
00369 Schema(
const Schema& x) :
inherited(
x) {}
00370
00371
#ifdef __STL_MEMBER_TEMPLATES
00372
template <
class InputIterator>
00373
Schema(InputIterator first, InputIterator last)
00374 : inherited(first, last) {}
00375
#else
00376 Schema(const_iterator first, const_iterator last)
00377
:
inherited(first, last) {}
00378
#endif
00380
00381
00382
00383
00388
bool findColumn(
const string& name,
int& position,
int& start,
00389
int& precision)
const;
00390
00392
FieldPtr findColumn(
int position)
const;
00393
00395
FieldPtr findColumn(
const string& name)
const;
00396
00401 FieldPtr operator()(
int position)
const {
00402
return findColumn(position);
00403 }
00404
00405 FieldPtr operator()(
const string& name)
const {
00406
return findColumn(name);
00407 }
00408 };
00409
00410
00411
00412
00419
/// Key used by the SimpleDB indexes: a thin wrapper around a byte container
/// of type KeyType, exposing just what is needed to build, compare and hash
/// raw keys.
template <class KeyType>
class SimpleDBIndexKey {
public:
    typedef KeyType ByteArr;
    typedef typename ByteArr::iterator iterator;

    /// Empty key.
    SimpleDBIndexKey() {}

    /// Key of @p len bytes, all set to '\0'.
    explicit SimpleDBIndexKey(size_t len)
        : raw(len, '\0') {}

    /// Key holding a copy of the @p len bytes starting at @p the_raw.
    SimpleDBIndexKey(const unsigned char* the_raw, size_t len)
        : raw(len, '\0')
    {
        std::copy(the_raw, the_raw + len, raw.begin());
    }

    /// Key holding a copy of an existing byte container.
    SimpleDBIndexKey(const ByteArr& the_raw)
        : raw(the_raw) {}

    /// Raw access to the first byte of the key, viewed as char*.
    operator char*() const {
        return (char*)(&raw[0]);
    }

    /// Number of bytes in the key.
    size_t byteLength() const {
        return raw.size();
    }

    bool operator==(const SimpleDBIndexKey& other) const {
        return raw == other.raw;
    }

    bool operator!=(const SimpleDBIndexKey& other) const {
        return raw != other.raw;
    }

    /// Change the number of bytes in the key.
    void resize(size_t len) {
        raw.resize(len);
    }

    typename ByteArr::iterator begin() {
        return raw.begin();
    }

    typename ByteArr::iterator end() {
        return raw.end();
    }

private:
    ByteArr raw;  ///< the key bytes
};
00491
00492
00493
00494
00503
template <
class KeyType = TinyVector<
unsigned char, 8>,
00504
class QueryResult = TinyVector<
unsigned int, 4> >
00505 class SimpleDB
00506 {
00509
00510
public:
00512
00515 typedef unsigned long RowNumber;
00516
enum {
00517
InvalidRow = ULONG_MAX
00518 };
00519
00521 typedef unsigned long Offset;
00522
00524 enum AccessType {
00525
readwrite = 0,
00526
readonly = 1
00527 };
00528
00531 static const Offset AbsoluteFileLimit = 512ul * 1024ul * 1024ul - 1;
00532
00534 typedef QueryResult
QueryResult_t;
00535
static QueryResult
EmptyResult;
00536
00538 typedef SimpleDBIndexKey<KeyType> IndexKey;
00539 typedef Hash<IndexKey,QueryResult> Index;
00540 typedef PP<Index> PIndex;
00541
00542
public:
00543
00545
SimpleDB(
string rootname,
string path=
".", AccessType = readwrite,
00546
bool verbose=
true);
00547
virtual ~SimpleDB();
00548
00549
00551 string getName()
const {
00552
return name;
00553 }
00554 string getPath()
const {
00555
return path;
00556 }
00557
00558
00560
void setSchema(
const Schema& s);
00561
00562 const Schema&
getSchema()
const {
00563
return schema;
00564 }
00565
00566
void saveSchema();
00567
void loadSchema();
00568
00573 bool findColumn(
string name,
int& position,
int& start,
00574
int& precision)
const {
00575
return schema.
findColumn(name,position,start,precision);
00576 }
00577
00580 int indexOfField(
const string& fieldname)
const {
00581
return schema(fieldname).field_index();
00582 }
00583
00585
Row getRow(RowNumber) const;
00586
Row& getInRow(RowNumber,
Row&) const;
00587 RowNumber size()
const {
return size_; }
00588
00589 int length()
const {
00590
return int(
size());
00591 }
00592 int width()
const {
00593
return int(
schema.size());
00594 }
00595
00597
void addRow(
const Row&);
00598
void setRow(
const Row&, RowNumber);
00599
void truncateFromRow(RowNumber n);
00600
00601
00602
00604
00611
bool indexColumn(
string columnName,
00612
string secondColumn =
string(
""));
00613
00615
void clearIndex(
string columnName);
00616
00624 QueryResult findEqual(
const unsigned char* lookfor,
00625
string columnName,
00626
string secondColumn =
string(
""));
00627
00629
const QueryResult& findEqualIndexed(
const unsigned char* lookfor,
00630
string columnName,
00631
string secondColumn =
string(
""));
00632
00634 QueryResult findEqualLinear(
const unsigned char* lookfor,
00635
string columnName,
00636
string secondColumn =
string(
""));
00637
00639 typedef vector<const unsigned char*> vuc;
00640 QueryResult findEqualLinear(
const vuc& lookfor,
00641
string columnName,
00642
string secondColumn =
string(
""));
00643
00645 double tableSizeMultiplier()
const {
00646
return table_size_multiplier;
00647 }
00648 void tableSizeMultiplier(
double x) {
00649
table_size_multiplier =
x;
00650 }
00651
00653
00654
private:
00656
void computeSize();
00657
00660
void memoryToDisk(
Row&) const;
00661
00664
void diskToMemory(
Row&) const;
00665
00670
int seekToRow(RowNumber) const;
00671
00673
int seekToEnd() const;
00674
00676
void openAllFiles() const;
00677
00679
void closeAllFiles() const;
00680
00682 inline
int lastSegment() const;
00683
00685
string getSegmentPath(
int i) const;
00686
00687 private:
00689 string name;
00690 string path;
00691 AccessType access_type;
00692 int access_mask;
00693 RowNumber size_;
00694
00695
00697 Schema schema;
00698 int row_size;
00699 RowNumber max_records_file;
00700
00701
00702 mutable
vector<
int> allfd;
00703
00704
00706
00711 double table_size_multiplier;
00712
00715 vector<
PIndex> indexes;
00716
00718 bool verbose;
00719
00720 private:
00722
SimpleDB(const
SimpleDB&);
00723
void operator=(const SimpleDB&);
00724 };
00725
00726
00727
00738
00739
00740 class
FieldRowRef;
00741
00742 class
RowIterator {
00743
public:
00745 RowIterator() : curfield(0), curptr(0), schema(0) { }
00746 RowIterator(
const RowIterator& x) {
00747 curfield =
x.curfield;
00748 curptr =
x.curptr;
00749 schema =
x.schema;
00750 }
00751 bool operator==(
const RowIterator& x) {
00752
return
00753 curfield==
x.curfield &&
00754 curptr ==
x.curptr &&
00755 schema ==
x.schema;
00756 }
00757 bool operator!=(
const RowIterator& x) {
00758
return !((*this) ==
x);
00759 }
00760
00762 RowIterator(
int curf,
unsigned char* curp,
const Schema* sc)
00763 : curfield(curf), curptr(curp), schema(sc)
00764 { }
00765
00767
00769 RowIterator& operator=(
const RowIterator& x)
00770 {
00771
if (&
x !=
this)
00772 {
00773 curfield =
x.curfield;
00774 curptr =
x.curptr;
00775 schema =
x.schema;
00776 }
00777
return *
this;
00778 }
00779
00781 void copyFrom(
const RowIterator& it)
00782 {
00783
#ifdef BOUNDCHECK
00784
if(it.
precision()!=precision() || it.
getFieldType()!=getFieldType())
00785
PLERROR(
"In Row::iterator::copyFrom Source and destination fields not of same type or precision");
00786
#endif
00787
copy(it.
raw(),it.
raw()+it.
precision(),
raw());
00788 }
00789
00791
inline FieldRowRef
operator*() const;
00792
00794
00800 RowIterator& operator++() {
00801
if (schema && curptr && curfield < schema->size()) {
00802 curptr += (*schema)[curfield].precision;
00803 ++curfield;
00804 }
00805
return *
this;
00806 }
00807
00809 RowIterator operator++(
int) {
00810 RowIterator
x = *
this;
00811 ++(*this);
00812
return x;
00813 }
00814
00817 RowIterator operator[](
int i) {
00818 assert(i >= 0);
00819 RowIterator it = *
this;
00820
while (i--)
00821 ++it;
00822
return it;
00823 }
00824
00825 FieldType getFieldType()
const
00826
{
return (*schema)[curfield].field_type; }
00827
00830
00831 bool isString()
const {
00832
return schema && curptr && curfield < schema->size() &&
00833 (*schema)[curfield].field_type ==
StringType;
00834 }
00835
00836 bool isCharacter()
const {
00837
return schema && curptr && curfield < schema->size() &&
00838 (*schema)[curfield].field_type ==
CharacterType;
00839 }
00840
00841 bool isSignedChar()
const {
00842
return schema && curptr && curfield < schema->size() &&
00843 (*schema)[curfield].field_type ==
SignedCharType;
00844 }
00845
00846 bool isShort()
const {
00847
return schema && curptr && curfield < schema->size() &&
00848 (*schema)[curfield].field_type ==
ShortType;
00849 }
00850
00851 bool isInt()
const {
00852
return schema && curptr && curfield < schema->size() &&
00853 (*schema)[curfield].field_type ==
IntType;
00854 }
00855
00856 bool isFloat()
const {
00857
return schema && curptr && curfield < schema->size() &&
00858 (*schema)[curfield].field_type ==
FloatType;
00859 }
00860
00861 bool isDouble()
const {
00862
return schema && curptr && curfield < schema->size() &&
00863 (*schema)[curfield].field_type ==
DoubleType;
00864 }
00865
00866 bool isDate()
const {
00867
return schema && curptr && curfield < schema->size() &&
00868 (*schema)[curfield].field_type ==
DateType;
00869 }
00870
00871
00877 char* asString() {
00878
bool iss = isString();
00879
if (iss)
00880
return reinterpret_cast<char*>(curptr);
00881
else
00882
return 0;
00883
00884 }
00885
00886 unsigned char* asCharacter() {
00887
return isCharacter()?
00888 reinterpret_cast<unsigned char*>(curptr) : 0;
00889 }
00890
00891 signed char* asSignedChar() {
00892
return isSignedChar()?
00893 reinterpret_cast<signed char*>(curptr) : 0;
00894 }
00895
00896 short* asShort() {
00897
return isShort()? reinterpret_cast<short*>(curptr) : 0;
00898 }
00899
00900 int* asInt() {
00901
return isInt()? reinterpret_cast<int*>(curptr) : 0;
00902 }
00903
00904 float* asFloat() {
00905
return isFloat()? reinterpret_cast<float*>(curptr) : 0;
00906 }
00907
00908 double* asDouble() {
00909
return isDouble()? reinterpret_cast<double*>(curptr) : 0;
00910 }
00911
00912 PDate* asDate() {
00913
return isDate()? reinterpret_cast<PDate*>(curptr) : 0;
00914 }
00915
00916
00923 const char* asString()
const {
00924
return isString()? reinterpret_cast<const char*>(curptr) : 0;
00925 }
00926
00927 const unsigned char* asCharacter()
const {
00928
return isCharacter()?
00929 reinterpret_cast<const unsigned char*>(curptr) : 0;
00930 }
00931
00932 const signed char* asSignedChar()
const {
00933
return isSignedChar()?
00934 reinterpret_cast<const signed char*>(curptr) : 0;
00935 }
00936
00937 const short* asShort()
const {
00938
return isShort()? reinterpret_cast<const short*>(curptr) : 0;
00939 }
00940
00941 const int* asInt()
const {
00942
return isInt()? reinterpret_cast<const int*>(curptr) : 0;
00943 }
00944
00945 const float* asFloat()
const {
00946
return isFloat()? reinterpret_cast<const float*>(curptr) : 0;
00947 }
00948
00949 const double* asDouble()
const {
00950
return isDouble()? reinterpret_cast<const double*>(curptr) : 0;
00951 }
00952
00953 const PDate* asDate()
const {
00954
return isDate()? reinterpret_cast<const PDate*>(curptr) : 0;
00955 }
00956
00957
00960
double toDouble() const;
00961
string toString() const;
00962
00963
00965
bool isMissing() const;
00966
void setMissing();
00967
00968
00971 string name()
const {
00972
return (schema && curfield < schema->size())?
00973 (*schema)[curfield].name :
string(
"");
00974 }
00975
00978 int precision()
const {
00979
return (schema && curfield < schema->size())?
00980 (*schema)[curfield].precision : -1;
00981 }
00982
00985
int char_width() const;
00986
00988 unsigned char* raw() {
00989
return curptr;
00990 }
00991
00992 const unsigned char*
raw()
const {
00993
return curptr;
00994 }
00995
01000 FieldPtr makePtr()
const {
01001
return schema->
findColumn(curfield);
01002 }
01003
01004
private:
01005 unsigned curfield;
01006 unsigned char* curptr;
01007 const Schema* schema;
01008 };
01009
01010
01011
01018 class FieldRowRef
01019 {
01020
public:
01021 FieldRowRef(
const RowIterator& it)
01022 : it_(it) {}
01023
01025 operator FieldValue() const;
01026
FieldRowRef& operator=(const
FieldValue&);
01027 FieldRowRef& operator=(const
FieldRowRef rhs) {
01028
return operator=(FieldValue(rhs));
01029 }
01030
01032
inline RowIterator operator&() const;
01033
01034 private:
01035 RowIterator it_;
01036
01037 };
01038
01039
01040
01047 class
Row {
01048
01049
public:
01051 typedef RowIterator iterator;
01052 typedef iterator
const_iterator;
01053 typedef long size_type;
01054
01055
public:
01060 Row() : rawrow(), schema(0) { }
01061 Row(
const Row& r) : rawrow(r.rawrow), schema(r.schema) { }
01062
01064 Row(
const Schema* s);
01065
01068 Row(
const vector<unsigned char>& raw,
const Schema* s)
01069 : rawrow(
raw), schema(s) { }
01070
01071
01073
01074
01076 iterator begin() {
01077
return iterator(0,
raw(), schema);
01078 }
01079
01080 iterator end() {
01081
if (schema)
01082
return iterator(schema->size(),
raw()+rawrow.size(),
01083 schema);
01084
else
01085
return iterator(0,0,0);
01086 }
01087
01089 size_type
size()
const {
01090
return (size_type)rawrow.size();
01091 }
01092
01093 size_type max_size()
const {
01094
return (size_type)rawrow.size();
01095 }
01096
01097 bool empty()
const {
01098
return (schema && schema->empty()) || !schema;
01099 }
01100
01102 const unsigned char*
raw()
const {
01103
if (rawrow.size())
01104
return &rawrow[0];
01105
else
01106
return 0;
01107 }
01108
01109 unsigned char*
raw() {
01110
if (rawrow.size())
01111
return &rawrow[0];
01112
else
01113
return 0;
01114 }
01115
01116 const Schema*
getSchema()
const {
01117
return schema;
01118 }
01119
01127 iterator operator[](
int fieldNumber);
01128 iterator operator[](
string fieldName);
01129
01135 iterator bind(
const FieldPtr& p)
const {
01136
if (!p)
01137
PLERROR(
"Trying to dereference a null FieldPtr");
01138
return iterator(p.
field_index_,
01139 const_cast<unsigned char*>(
raw()) + p.
offset_,
01140 schema);
01141 }
01142
01147
void sanitize() const;
01148
01149 private:
01150 vector<
unsigned char> rawrow;
01151 const
Schema* schema;
01152 };
01153
01154
01155
01156
01158
double todouble(const Row::iterator& it);
01159
string tostring(const Row::iterator& it);
01160
01163 ostream& operator<<(ostream& o, const Row::iterator& field);
01164
01166 ostream& operator<<(ostream&, const Row& row);
01167
01169
void printFieldName(ostream& o, const Row::iterator& field);
01170
01172
void printFieldNames(ostream& o, const Row& row);
01173
01174
01175
01176
01178 typedef
SimpleDB<>
SDB;
01179
01182
void randomShuffleRows(SDB& sdb);
01183
01186
void halfShuffleRows(SDB& sdb);
01187
01188
01189
01190
01191
01192 FieldRowRef RowIterator::operator*()
const
01193
{
01194
return FieldRowRef(*
this);
01195 }
01196
01197 RowIterator FieldRowRef::operator&()
const
01198
{
01199
return it_;
01200 }
01201
01202
01203
01204
01205
01206
template <
class KT,
class QR>
01207 QR
SimpleDB<KT,QR>::EmptyResult;
01208
01209
01210
01211
01212
01213
template <
class KT,
class QR>
01214 SimpleDB<KT,QR>::SimpleDB(
string rootname,
string the_path,
01215 AccessType the_access_type,
bool the_verbose)
01216 : name(rootname), path(the_path), access_type(the_access_type),
01217 access_mask(0), schema(), row_size(), allfd(),
01218 table_size_multiplier(1.8), indexes(), verbose(the_verbose)
01219 {
01220
if (
path !=
"")
01221
path +=
slash;
01222
string fullpath =
path +
name +
".sdb";
01223
01224
switch (
access_type) {
01225
case readwrite:
01226
access_mask = O_RDWR;
01227
break;
01228
case readonly:
01229
access_mask = O_RDONLY;
01230
break;
01231 }
01232
01233
loadSchema();
01234
openAllFiles();
01235
computeSize();
01236 }
01237
01238
template <
class KT,
class QR>
01239 SimpleDB<KT,QR>::~SimpleDB()
01240 {
01242
closeAllFiles();
01243
saveSchema();
01244 }
01245
01246
template <
class KT,
class QR>
01247 void SimpleDB<KT,QR>::setSchema(
const Schema& s)
01248 {
01249
schema = s;
01250 Row row(&s);
01251
row_size = row.
size();
01252
indexes.resize(s.size());
01253
01256
if (
row_size > 0)
01257
max_records_file =
RowNumber(
AbsoluteFileLimit /
row_size);
01258
else
01259
max_records_file = 0;
01260
01262
01264
closeAllFiles();
01265
openAllFiles();
01266
computeSize();
01267 }
01268
01269
template <
class KT,
class QR>
01270 void SimpleDB<KT,QR>::saveSchema()
01271 {
01272
if (
access_type==
readwrite) {
01273
string fullpath =
path +
name +
".ssc";
01274 ofstream sf(fullpath.c_str(), ios::out);
01275 Schema::iterator it =
schema.begin(), end =
schema.end();
01276
for (; it != end; ++it) {
01277 sf << it->name <<
" ";
01278
switch (it->field_type) {
01279
case Unknown:
01280
break;
01281
case StringType:
01282 sf <<
"string " << it->precision <<
endl;
01283
break;
01284
01285
case CharacterType:
01286 sf <<
"character" <<
endl;
01287
break;
01288
01289
case SignedCharType:
01290 sf <<
"signedchar" <<
endl;
01291
break;
01292
01293
case ShortType:
01294 sf <<
"short" <<
endl;
01295
break;
01296
01297
case IntType:
01298 sf <<
"int" <<
endl;
01299
break;
01300
01301
case FloatType:
01302 sf <<
"float" <<
endl;
01303
break;
01304
01305
case DoubleType:
01306 sf <<
"double" <<
endl;
01307
break;
01308
01309
case DateType:
01310 sf <<
"date" <<
endl;
01311
break;
01312
01313
default:
01314
PLERROR(
"Unknown field type in database: %d", it->field_type);
01315 }
01316 }
01317 }
01318 }
01319
01320
template <
class KT,
class QR>
01321 void SimpleDB<KT,QR>::loadSchema()
01322 {
01325
string fullpath =
path +
name +
".ssc";
01326 ifstream sf(fullpath.c_str());
01327
Schema schema;
01328
while (sf) {
01329
string name,type;
01330 sf >> name >> type;
01331
if (name.size() == 0 || type.size() == 0)
01332
break;
01333 type =
lowerstring(type);
01334
if (type ==
"string") {
01335
int length;
01336 sf >> length;
01337 schema.push_back(
Field(name,
StringType,length));
01338 }
01339
else if (type ==
"character")
01340 schema.push_back(
Field(name,
CharacterType));
01341
else if (type ==
"signedchar")
01342 schema.push_back(
Field(name,
SignedCharType));
01343
else if (type ==
"short")
01344 schema.push_back(
Field(name,
ShortType));
01345
else if (type ==
"int")
01346 schema.push_back(
Field(name,
IntType));
01347
else if (type ==
"float")
01348 schema.push_back(
Field(name,
FloatType));
01349
else if (type ==
"double")
01350 schema.push_back(
Field(name,
DoubleType));
01351
else if (type ==
"date")
01352 schema.push_back(
Field(name,
DateType));
01353
else {
01354 cerr <<
"Unexpected input type \"" << type
01355 <<
"\" in schema file " << fullpath <<
endl;
01356 exit(1);
01357 }
01358 }
01359 setSchema(schema);
01360 }
01361
01362
template <
class KT,
class QR>
01363 void SimpleDB<KT,QR>::addRow(
const Row& row)
01364 {
01365
if(
row_size != row.
size())
01366
PLERROR(
"In addRow row_size != row.size() (%d != %d)",
row_size, row.
size());
01367 row.
sanitize();
01368
int fd =
seekToEnd();
01369 off_t curpos = lseek(fd, 0L, SEEK_CUR);
01370
01371
#ifdef LITTLEENDIAN
01372
01373
01374
int writtensize = ::write(fd, row.
raw(),
row_size);
01375
#endif
01376
#ifdef BIGENDIAN
01377
Row newrow(row);
01378 memoryToDisk(newrow);
01379
int writtensize = ::write(fd, newrow.
raw(),
row_size);
01380
#endif
01381
01383
if (writtensize == -1) {
01386
#if defined(_MINGW_) || defined(WIN32)
01387
PLWARNING(
"could not truncate database file, end may be corrupted!");
01388
#else
01389
ftruncate(fd, curpos);
01390
#endif
01391
PLERROR(
"Could not write to database: %s", strerror(errno));
01392 }
01393
else
01394
size_++;
01395 }
01396
01397
template <
class KT,
class QR>
01398 void SimpleDB<KT,QR>::setRow(
const Row& row, RowNumber n)
01399 {
01400
if(n<0 || n>=
size())
01401
PLERROR(
"Out of bounds in SimpleDB::setRow");
01402
if(
row_size != row.
size())
01403
PLERROR(
"In setRow row_size != row.size() (%d != %d)",
row_size, row.
size());
01404 row.
sanitize();
01405
int fd =
seekToRow(n);
01406
01407
#ifdef LITTLEENDIAN
01408
int writtensize = ::write(fd, row.
raw(),
row_size);
01409
#endif
01410
#ifdef BIGENDIAN
01411
Row newrow(row);
01412 memoryToDisk(newrow);
01413
int writtensize = ::write(fd, newrow.
raw(),
row_size);
01414
#endif
01415
01417
if (writtensize == -1)
01418
PLERROR(
"Could not write to database: %s",strerror(errno));
01419 }
01420
01421
01422
template <
class KT,
class QR>
01423 void SimpleDB<KT,QR>::truncateFromRow(RowNumber n)
01424 {
01430
int curfd =
seekToRow(n);
01431 off_t curpos = lseek(curfd, 0L, SEEK_CUR);
01432
if (ftruncate(curfd, curpos) == -1) {
01433
PLERROR((
string(
"Could not truncate database at row ") +
01434
tostring(n) +
": " + strerror(errno)).
c_str());
01435 }
01436
01437
vector<int>::iterator found =
find(
allfd.begin(),
allfd.end(), curfd);
01438
int fromfd = found-
allfd.begin() + 1;
01439
int last =
lastSegment();
01440
01441
closeAllFiles();
01442
bool allok =
true;
01443
01444
for ( ; fromfd <= last; ++fromfd) {
01445
string path =
getSegmentPath(fromfd);
01446
if(unlink(path.c_str()) == -1) {
01447
PLWARNING((
string(
"Could not unlink database segment ") + path +
01448
": " + strerror(errno)).
c_str());
01449 allok =
false;
01450 }
01451 }
01452
01453
if (allok) {
01454
openAllFiles();
01455
computeSize();
01456 }
01457
else
01458
PLERROR(
"Error during truncation");
01459 }
01460
01461
01462
template <
class KT,
class QR>
01463 Row&
SimpleDB<KT,QR>::getInRow(RowNumber n, Row& row)
const
01464
{
01465
if(n<0 || n>=
size())
01466
PLERROR(
"Out of Bounds in SimpleDB::getInRow");
01467
if(
row_size != row.
size())
01468
PLERROR(
"In getInRow row_size!=row_size()");
01469
int fd =
seekToRow(n);
01470
01471
int size_read = ::read(fd, row.
raw(),
row_size);
01472
if (size_read == -1)
01473
PLERROR(
"Could not read from database: %s",strerror(errno));
01474
diskToMemory(row);
01475
return row;
01476 }
01477
01478
template <
class KT,
class QR>
01479 Row
SimpleDB<KT,QR>::getRow(RowNumber n)
const
01480
{
01481 Row row(&
schema);
01482
getInRow(n, row);
01483
return row;
01484 }
01485
01486
template <
class KT,
class QR>
01487 void SimpleDB<KT,QR>::computeSize()
01488 {
01489
if(
row_size<=0)
01490
size_ = 0;
01491
else
01492 {
01493
size_ = 0;
01494
int i=0;
01495
int bytesinfile =
file_size(
getSegmentPath(i++));
01496
while(bytesinfile>0)
01497 {
01498
size_ += bytesinfile/
row_size;
01499 bytesinfile =
file_size(
getSegmentPath(i++));
01500 }
01501 }
01502
01505
01518 }
01519
01520
template <
class KT,
class QR>
01521 void SimpleDB<KT,QR>::memoryToDisk(Row& row)
const
01522
{
01523
#ifdef LITTLEENDIAN
01524
01525
#endif
01526
#ifdef BIGENDIAN
01527
Row newr(row);
01528
Row::iterator it = newr.
begin(), end = newr.
end();
01529
for(; it != end; ++it) {
01531
if (
short*
x = it.
asShort())
01532
reverse_short(
x,1);
01533
if (
int*
x = it.
asInt())
01534
reverse_int(
x,1);
01535
if (
float*
x = it.
asFloat())
01536
reverse_float(
x,1);
01537
if (
double*
x = it.
asDouble())
01538
reverse_double(
x,1);
01539
if (
PDate*
x = it.
asDate()) {
01540
reverse_short(&(
x->year),1);
01541 }
01542 }
01543
#endif
01544
}
01545
01546
template <
class KT,
class QR>
01547 void SimpleDB<KT,QR>::diskToMemory(Row& row)
const
01548
{
01549 memoryToDisk(row);
01550 }
01551
01552
01553
01554
01555
template <
class KT,
class QR>
01556 int SimpleDB<KT,QR>::seekToRow(RowNumber i)
const
01557
{
01559
01560
if (
max_records_file == 0)
01561
PLERROR(
"Attempting to seekToRow without schema set");
01562
01563
int segmentNumber =
int(i /
max_records_file);
01564
Offset rowInSegment =
Offset(i % max_records_file);
01565
01567
if (segmentNumber >
lastSegment()) {
01568
for (
int i =
lastSegment()+1; i <= segmentNumber; ++i) {
01569
int fd =
open(
getSegmentPath(i).
c_str(),
01570
access_mask | O_CREAT, 0777);
01571
if (fd == -1)
01572
PLERROR(
"Could not open database segment %d at path %s: %s",
01573 i,
getSegmentPath(i).
c_str(), strerror(errno));
01574
allfd.push_back(fd);
01575 }
01576 }
01577
if (
allfd[segmentNumber] == -1)
01578
PLERROR(
"Problem accessing database segment %d at path %s",
01579 segmentNumber,
getSegmentPath(segmentNumber).
c_str());
01580
01581
if (lseek(
allfd[segmentNumber],
01582 rowInSegment * Offset(
row_size), SEEK_SET)<0)
01583
PLERROR(
"problem in lseek: %s",strerror(errno));
01584
return allfd[segmentNumber];
01585 }
01586
01587
01588
template <
class KT,
class QR>
01589 int SimpleDB<KT,QR>::seekToEnd()
const
01590
{
01592
01593
if (
max_records_file == 0)
01594
PLERROR(
"Attempting to seekToEnd without schema set");
01595
01596
int last =
lastSegment();
01597
int fd =
allfd[last];
01598
if (fd == -1)
01599
PLERROR(
"Problem accessing database segment %d at path %s",
01600 last,
getSegmentPath(last).
c_str());
01601
01602 off_t pos = lseek(fd, 0ul, SEEK_END);
01603
01606
if (
Offset(pos) /
Offset(
row_size) >=
max_records_file)
01607 fd =
seekToRow((last+1)*
max_records_file);
01608
01609
return fd;
01610 }
01611
01612
01613
template <
class KT,
class QR>
01614 void SimpleDB<KT,QR>::openAllFiles()
const
01615
{
01621
closeAllFiles();
01622
01623
int fd;
01624 fd =
open(
getSegmentPath(0).
c_str(),
01625
access_mask | O_CREAT, 0777);
01626
if (fd == -1)
01627
PLERROR(
"Could not open main database segment %s: %s",
01628
getSegmentPath(0).
c_str(), strerror(errno));
01629
allfd.push_back(fd);
01630
01631
int index = 1;
01632
for ( ; ; ++index ) {
01633 fd =
open(
getSegmentPath(index).
c_str(),
01634
access_mask);
01635
if (fd == -1)
01636
break;
01637
else
01638
allfd.push_back(fd);
01639 }
01640 }
01641
01642
01643
template <
class KT,
class QR>
01644 void SimpleDB<KT,QR>::closeAllFiles()
const
01645
{
01646
vector<int>::iterator it=
allfd.begin(), end=
allfd.end();
01647
for (; it != end; ++it)
01648
if (*it != -1) {
01649 close(*it);
01650 }
01651
allfd.clear();
01652 }
01653
01654
01655
template <
class KT,
class QR>
01656 inline int SimpleDB<KT,QR>::lastSegment()
const
01657
{
01658
return (
int)
allfd.size() - 1;
01659 }
01660
01661
01662
template <
class KT,
class QR>
01663 string SimpleDB<KT,QR>::getSegmentPath(
int i)
const
01664
{
01665
string fullpath =
path +
name;
01666
if (i >= 1 && i <= 26) {
01667
string postfix(1,
char(
'a'+i-1));
01668 fullpath +=
string(
"_") + postfix;
01669 }
01670
else if (i > 26)
01671
PLERROR(
"Too many segments in the database.");
01672
if(fullpath.find(
".sdb")==string::npos)
01673 fullpath +=
".sdb";
01674
return fullpath;
01675 }
01676
01677
01678
01679
01680
template <
class KT,
class QR>
01681 bool SimpleDB<KT,QR>::indexColumn(
string column_name,
01682
string second_column)
01683 {
01684
bool has_second_column = (second_column.size() > 0);
01685
int n, start_pos, column_precision =0,
01686 n2, start_pos2, column_precision2=0;
01687
if (!findColumn(column_name, n, start_pos, column_precision))
01688
return false;
01689
if (has_second_column &&
01690 !findColumn(second_column, n2, start_pos2, column_precision2))
01691
return false;
01692
01697
RowNumber maxrows =
size();
01698
RowNumber tablesize =
RowNumber(
table_size_multiplier*maxrows);
01699
if (maxrows <= 0 || tablesize <= 0) {
01700
PLWARNING(
"SimpleDB::indexColumn: cannot index a database of "
01701
"zero size.");
01702
return false;
01703 }
01704
if (!
indexes[n]) {
01705
indexes[n] =
new Index(tablesize,
true);
01706
indexes[n]->initializeTable((
unsigned int)tablesize);
01707 }
01708
Index& index = *
indexes[n];
01709 index.
resize(tablesize);
01710 index.
flush();
01711
01712 Row currow(&
schema);
01713
IndexKey key(column_precision + column_precision2);
01714
typename IndexKey::iterator keybegin = key.
begin();
01715
unsigned char* begin1 = currow.
raw() + start_pos;
01716
unsigned char* end1 = begin1 + column_precision;
01717
unsigned char* begin2 = currow.
raw() + start_pos2;
01718
unsigned char* end2 = begin2 + column_precision2;
01719
01720
for(RowNumber i=0; i<maxrows; ++i) {
01721
if (
verbose && i % 1000000 == 0) {
01722
unsigned numclusters, maxcluster;
01723 index.
diagnostics(numclusters, maxcluster);
01724 cerr <<
"indexing row " << i
01725 <<
"\t num. clusters=" << numclusters
01726 <<
"\t max. cluster size=" << maxcluster
01727 <<
endl;
01728 }
01729
01730
getInRow(i,currow);
01731
01737
copy(begin1, end1, keybegin);
01738
if (has_second_column)
01739
copy(begin2, end2, keybegin+column_precision-1);
01740
unsigned int addr = index.
hashAddress(key);
01741
01742
if (addr ==
Hash_UNUSED_TAG) {
01748
bool needresize = !index.
add(key,
QueryResult_t());
01749
if (needresize) {
01750 cerr <<
"Hash table unexpectedly full; exiting..." <<
endl;
01751 exit(1);
01752 }
01753 addr = index.
hashAddress(key);
01754 }
01755
01757
QueryResult_t* qr = index[addr];
01758
try {
01759 qr->push_back(i);
01760 }
01761
catch (logic_error& e) {
01762 cerr <<
"Exception caught during indexing: "
01763 <<
typeid(e).name() <<
endl
01764 <<
"Containing: " << e.what() <<
endl;
01765
throw;
01766 }
01767 }
01768
return true;
01769 }
01770
01771
template <
class KT,
class QR>
01772 void SimpleDB<KT,QR>::clearIndex(
string column_name)
01773 {
01774
int n, start_pos, column_precision=0;
01775
if (findColumn(column_name, n, start_pos, column_precision))
01776
indexes[n] = 0;
01777 }
01778
01779
template <
class KT,
class QR>
01780 QR
SimpleDB<KT,QR>::findEqual(
const unsigned char* lookfor,
01781
string column_name,
string second_column)
01782 {
01783
int n, start_pos, column_precision;
01784
if (!findColumn(column_name, n, start_pos, column_precision))
01785
return EmptyResult;
01786
if (
indexes[n])
01787
return findEqualIndexed(lookfor, column_name, second_column);
01788
else
01789
return findEqualLinear(lookfor, column_name, second_column);
01790 }
01791
01792
01793
template <
class KT,
class QR>
01794 const QR&
SimpleDB<KT,QR>::findEqualIndexed(
const unsigned char* lookfor,
01795
string column_name,
01796
string second_column)
01797 {
01798
bool has_second_column = (second_column.size() > 0);
01799
int n, start_pos, column_precision =0,
01800 n2, start_pos2, column_precision2=0;
01801
if (!findColumn(column_name, n, start_pos, column_precision))
01802
return EmptyResult;
01803
if (has_second_column &&
01804 !findColumn(second_column, n2, start_pos2, column_precision2))
01805
return EmptyResult;
01806
01807
if (!
indexes[n])
01808
PLERROR(
"SimpleDB::indexColumn must be done before performing "
01809
"indexed searches on column %s", column_name.c_str());
01810
01811
Index& index = *
indexes[n];
01812
IndexKey key(lookfor, column_precision+column_precision2);
01813
unsigned int addr = index.
hashAddress(key);
01814
if (addr ==
Hash_UNUSED_TAG)
01815
return EmptyResult;
01816
else
01817
return *index[addr];
01818 }
01819
01820
01821
template <
class KT,
class QR>
01822 QR
SimpleDB<KT,QR>::findEqualLinear(
const unsigned char* lookfor,
01823
string column_name,
01824
string second_column)
01825 {
01826
vuc lf(1);
01827 lf[0] = lookfor;
01828
return findEqualLinear(lf, column_name, second_column);
01829 }
01830
01831
01832
template <
class KT,
class QR>
01833 QR SimpleDB<KT,QR>::findEqualLinear(
01834
const vuc& lookfor,
01835
string column_name,
string second_column)
01836 {
01837
bool has_second_column = (second_column.size() > 0);
01838
int n, start_pos, column_precision =0,
01839 n2, start_pos2, column_precision2=0;
01840
if (!findColumn(column_name, n, start_pos, column_precision))
01841
return EmptyResult;
01842
if (has_second_column &&
01843 !findColumn(second_column, n2, start_pos2, column_precision2))
01844
return EmptyResult;
01845
01846 QR qr;
01847
01849
vector<IndexKey> key_lookfor(lookfor.size());
01850 vuc::const_iterator
01851 lookit, lookbeg = lookfor.begin(), lookend = lookfor.end();
01852
typename vector<IndexKey>::iterator
01853 keyit, keybeg = key_lookfor.begin(), keyend = key_lookfor.end();
01854 size_t len = column_precision+column_precision2;
01855
01856
for (lookit=lookbeg, keyit=keybeg ; lookit != lookend ;
01857 ++lookit, ++keyit) {
01858 keyit->resize(len);
01859
copy(*lookit, *lookit+len, keyit->begin());
01860 }
01861
01862
IndexKey key_dbrow(column_precision+column_precision2);
01863
typename IndexKey::iterator keybegin = key_dbrow.
begin();
01864
01865
RowNumber maxrows =
size();
01866 Row currow(&
schema);
01867
unsigned char* begin1 = currow.
raw() + start_pos;
01868
unsigned char* end1 = begin1 + column_precision;
01869
unsigned char* begin2 = currow.
raw() + start_pos2;
01870
unsigned char* end2 = begin2 + column_precision2;
01871
01873
for (
RowNumber i=0; i<maxrows; ++i) {
01874
if (
verbose && i % 1000000 == 0) {
01875 cerr <<
"Searching row " << i <<
endl;
01876 }
01877
01878
getInRow(i, currow);
01879
01880
copy(begin1, end1, keybegin);
01881
if (has_second_column)
01884
copy(begin2, end2, keybegin+column_precision-1);
01885
01887
for (keyit = keybeg ; keyit != keyend ; ++keyit)
01888
if (*keyit == key_dbrow) {
01889 qr.push_back(i);
01890
if (
verbose)
01891 cerr <<
"Found string \"" << *keyit
01892 <<
"\" at row " << i <<
endl;
01893 }
01894 }
01895
01896
return qr;
01897 }
01898
01899
#ifdef WIN32
01900
#undef open
01901
#undef close
01902
#undef lseek
01903
#undef read
01904
#undef write
01905
#undef unlink
01906
#endif
01907
01908 }
01909
01910
#endif
01911