BTree realization
authorteodor <teodor>
Fri, 4 Feb 2005 18:58:46 +0000 (18:58 +0000)
committerteodor <teodor>
Fri, 4 Feb 2005 18:58:46 +0000 (18:58 +0000)
Makefile
flatdb.c
gendata.c [new file with mode: 0644]
tbtree.c [new file with mode: 0644]
tbtree.h [new file with mode: 0644]
tbtreetest.c [new file with mode: 0644]

index 7184bef..12e6690 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -9,8 +9,8 @@ LIB=-g -L. -ltedtools -lm
 
 OBJS=tlog.o tmalloc.o tools.o prs_hmap.o sfxstr.o \
        regis.o prs_inf.o shmem.o tcp.o udp.o connpool.o \
 
 OBJS=tlog.o tmalloc.o tools.o prs_hmap.o sfxstr.o \
        regis.o prs_inf.o shmem.o tcp.o udp.o connpool.o \
-       psort.o flatdb.o
-PROGS=sfxtest hextest inftest kilter psortex flatdbtest
+       psort.o flatdb.o tbtree.o
+PROGS=sfxtest hextest inftest kilter psortex flatdbtest tbtreetest gendata
 
 .SUFFIXES: .o.c
 
 
 .SUFFIXES: .o.c
 
@@ -33,5 +33,5 @@ clean:
        rm -rf $(PROGS) *.o
        rm -rf libtedtools.a
        rm -rf *core *gmon* nohup.out
        rm -rf $(PROGS) *.o
        rm -rf libtedtools.a
        rm -rf *core *gmon* nohup.out
-       rm -rf sfxtest.log
+       rm -rf sfxtest.log BTREE
 
 
index 05dbe34..10e1a00 100644 (file)
--- a/flatdb.c
+++ b/flatdb.c
@@ -116,13 +116,21 @@ FDBOpen(FDB *db, char *file, int readonly) {
        if ( readonly ) {
                db->readonly=1;
                db->fd = open(file, O_RDONLY);
        if ( readonly ) {
                db->readonly=1;
                db->fd = open(file, O_RDONLY);
+               if ( db->fd < 0 ) {
+                       tlog(TL_CRIT,"FDBOpen: open failed: %s",strerror(errno));
+                       return FDB_ERROR;
+               }
                if ( flock( db->fd, LOCK_SH ) < 0 ) {
                        tlog(TL_CRIT,"FDBOpen: flock failed: %s",strerror(errno));
                        close(db->fd);
                        return FDB_ERROR;
                } 
        } else {
                if ( flock( db->fd, LOCK_SH ) < 0 ) {
                        tlog(TL_CRIT,"FDBOpen: flock failed: %s",strerror(errno));
                        close(db->fd);
                        return FDB_ERROR;
                } 
        } else {
-               db->fd = open(file, O_CREAT | O_RDWR, 0666);
+               db->fd = open(file, O_CREAT | O_RDWR);
+               if ( db->fd < 0 ) {
+                       tlog(TL_CRIT,"FDBOpen: open failed: %s",strerror(errno));
+                       return FDB_ERROR;
+               }
                if ( flock( db->fd, LOCK_EX ) < 0 ) {
                        tlog(TL_CRIT,"FDBOpen: flock failed: %s",strerror(errno));
                        close(db->fd);
                if ( flock( db->fd, LOCK_EX ) < 0 ) {
                        tlog(TL_CRIT,"FDBOpen: flock failed: %s",strerror(errno));
                        close(db->fd);
@@ -130,12 +138,6 @@ FDBOpen(FDB *db, char *file, int readonly) {
                } 
        }
 
                } 
        }
 
-       if ( db->fd < 0 ) {
-               memset(db, 0, sizeof(FDB));
-               tlog(TL_CRIT,"FDBOpen: open failed: %s", strerror(errno));
-               return FDB_ERROR;
-       }
-
        rc = read(db->fd, &header, sizeof(FDBHeader));
 
        if ( rc<0 ) {
        rc = read(db->fd, &header, sizeof(FDBHeader));
 
        if ( rc<0 ) {
diff --git a/gendata.c b/gendata.c
new file mode 100644 (file)
index 0000000..3b7ae6f
--- /dev/null
+++ b/gendata.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2005 Teodor Sigaev <teodor@sigaev.ru>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *      may be used to endorse or promote products derived from this software
+ *      without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <ctype.h>
+
+#include "tlog.h"
+
+static void
+usage() {
+        puts(
+        "Usage:\n"
+        "tbtreetest  -b [-r -c COUNT] \n"
+        );
+        exit(1);
+}
+
+extern char *optarg;
+extern int opterr;
+
+#define NO_MODE                0
+#define MODE_BTREE     1
+
+
+int
+main(int argn, char *argv[]) {
+        int mode=NO_MODE;
+       int isrnd=0, count=10,i; 
+
+        opentlog(TL_OPEN_STDERR,TL_DEBUG, NULL);
+          
+        while((i=getopt(argn,argv,"brc:")) != EOF) {
+                switch(i) {
+                        case 'r':
+                                isrnd=1;
+                                break;
+                        case 'b':
+                               if ( mode ) usage();
+                                mode = MODE_BTREE;
+                                break;
+                        case 'c':
+                                count=atoi(optarg);
+                                break;
+                        case 'h':
+                       default:
+                               usage();
+               }
+       }
+
+       if ( mode==MODE_BTREE ) {
+               int j, cnt,c;
+               for(i=0;i<count;i++) {
+                       printf("I\t%d\t", (int)( (isrnd) ? random()%count : i ));
+                       cnt=1+random()%512;
+                       for(j=0;j<cnt;j++) {
+                               do { c=random()%128; } while ( !isalnum(c) );   
+                               fputc(c, stdout);
+                       }
+                       fputc('\n', stdout);
+               }
+       } else {
+               usage();
+       }
+       return 0;
+}
diff --git a/tbtree.c b/tbtree.c
new file mode 100644 (file)
index 0000000..1351cc8
--- /dev/null
+++ b/tbtree.c
@@ -0,0 +1,1003 @@
+/*
+ * Copyright (c) 2005 Teodor Sigaev <teodor@sigaev.ru>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *      may be used to endorse or promote products derived from this software
+ *      without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+
+#include "tmalloc.h"
+#include "tlog.h"
+
+#include "tbtree.h"
+
+static int TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) ;
+static int TBTWritePage(TBTree *db, TBTMemPage *page) ;
+static int TBTNewPage(TBTree *db, TBTMemPage **page) ;
+
+int 
+TBTOpen(TBTree *db, char *file) {
+       off_t offset;
+
+       if ( db->readonly ) {
+               db->fd = open(file, O_RDONLY);
+               if ( db->fd < 0 ) {
+                       tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
+                       return TBT_ERROR;
+               }
+               if ( flock( db->fd, LOCK_SH ) < 0 ) {
+                       tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
+                       close(db->fd);
+                       return TBT_ERROR;
+               }
+       } else {
+               db->fd = open(file, O_CREAT | O_RDWR, 0666);
+               if ( db->fd < 0 ) {
+                       tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
+                       return TBT_ERROR;
+               }
+               if ( flock( db->fd, LOCK_EX ) < 0 ) {
+                       tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
+                       close(db->fd);
+                       return TBT_ERROR;
+               }
+       }
+
+       if ( db->npage ) {
+               db->Cache=(TBTMemPage**)t0malloc( sizeof(TBTMemPage*)*db->npage );
+               db->TimeCache=(TBTMemPage**)t0malloc( sizeof(TBTMemPage*)*db->npage );
+               db->curpage=0;
+       }
+
+       db->cachehits = db->pageread = db->pagewrite = 0;
+
+       if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
+               tlog(TL_CRIT,"TBTOpen: lseek failed: %s",strerror(errno));
+               flock( db->fd, LOCK_UN );
+               close(db->fd);
+               return TBT_ERROR;
+       }
+
+       if ( offset==0 ) {
+               if ( db->readonly ) {
+                       tlog(TL_CRIT,"TBTOpen: file is void");
+                       flock( db->fd, LOCK_UN );
+                       close(db->fd);
+                       return TBT_ERROR;
+               } else {
+                       TBTMemPage *page;
+                       if ( TBTNewPage(db, &page) != TBT_OK ) {
+                               flock( db->fd, LOCK_UN );
+                               close(db->fd);
+                               return TBT_ERROR;
+                       }
+                       page->page.isleaf=1;
+                       if ( TBTWritePage(db, page)!=TBT_OK ){
+                               flock( db->fd, LOCK_UN );
+                               close(db->fd);
+                               return TBT_ERROR;
+                       }
+               }
+       } 
+
+       db->lastpagenumber=0;                            
+       return TBT_OK; 
+}
+
+int
+TBTClose(TBTree *db) {
+       int rc=0;
+
+       if ( !db->readonly ) 
+               rc=TBTSync(db);
+
+       if ( db->npage ) {
+               u_int32_t i;
+
+               for(i=0; i<db->curpage; i++)
+                       tfree( db->Cache[i] );
+
+               tfree( db->Cache );
+               tfree( db->TimeCache );
+       }
+       
+       flock( db->fd, LOCK_UN );
+       close(db->fd);  
+       
+       return (rc) ? TBT_ERROR : TBT_OK;
+}
+
+int
+TBTSync(TBTree *db) {
+       u_int32_t i;
+       int rc=0;
+
+       if ( db->readonly || db->npage==0 )
+               return TBT_OK;
+       for(i=0; i<db->curpage; i++)
+               if ( !db->Cache[i]->issynced )
+                       rc |= TBTWritePage(db, db->Cache[i]);
+       
+       if ( fsync(db->fd) ) {
+               tlog(TL_CRIT,"TBTSync: fsync failed: %s",strerror(errno));
+               rc=1;
+       }
+
+       return (rc) ? TBT_ERROR : TBT_OK;
+}
+
+static int
+cmpNPage(const void *a, const void *b) {
+       if (  (*(TBTMemPage**)a)->pagenumber == (*(TBTMemPage**)b)->pagenumber ) 
+               return 0;
+       return ( (*(TBTMemPage**)a)->pagenumber > (*(TBTMemPage**)b)->pagenumber ) ? 1 : -1;
+}
+
+static TBTMemPage*
+PageAlloc(TBTree *db, int *pos) {
+       TBTMemPage *page;
+
+       if ( db->curpage < db->npage ) {
+               page = db->Cache[db->curpage] = db->TimeCache[db->curpage] = (TBTMemPage*)t0malloc(sizeof(TBTMemPage));
+               *pos = db->curpage;
+               page->iscached=1;
+               db->curpage++;
+       } else if ( db->npage == 0 ) {
+               page = (TBTMemPage*)t0malloc(sizeof(TBTMemPage));
+       } else {
+               u_int32_t i;
+               for( i=0; i<db->curpage && db->TimeCache[i]->islocked; i++ );
+               if ( i == db->curpage ) { /*all pages locked*/
+                       /*tlog(TL_WARN,"TBTReadPage: all pages in cache locked, increase number of cached pages");*/
+                       page = (TBTMemPage*)t0malloc(sizeof(TBTMemPage));
+               } else {
+                       page = db->TimeCache[i];
+                       if ( !page->issynced )
+                               if ( TBTWritePage(db, page) )
+                                       return NULL;
+                       memset(page,0,sizeof(TBTMemPage));
+                       page->iscached=1;
+                       *pos=i;
+               }
+       }
+
+       return page;
+}
+
+static void
+moveFirstTime(TBTree *db, int pos) {
+       TBTMemPage *ptr;
+       
+       if ( pos < 0 ) 
+               return;
+       if (pos+1==db->curpage )
+               return;
+
+       ptr = db->TimeCache[pos];
+       memmove( db->TimeCache + pos , db->TimeCache + pos + 1, sizeof(TBTMemPage*)*(db->curpage-pos-1) );
+       db->TimeCache[db->curpage-1] = ptr;
+}
+
+static void
+findAndMoveFirstTime(TBTree *db, TBTMemPage *page) {
+       int pos;
+       for(pos=0;pos<db->curpage;pos++)
+               if ( page==db->TimeCache[pos] ) {
+                       moveFirstTime(db,pos);
+                       break;
+               }
+       tassert( pos<db->curpage );
+}
+
+static void
+findAndMovePN(TBTree *db, TBTMemPage *page) {
+       int pos;
+
+       if ( db->curpage < 2 )
+               return;
+
+       for(pos=0;pos<db->curpage;pos++)
+               if ( page==db->Cache[pos] ) 
+                       break;
+       tassert( pos<db->curpage );
+
+       while(1) {
+               if ( pos+1 != db->curpage && db->Cache[pos+1]->pagenumber < page->pagenumber ) {
+                       db->Cache[pos] = db->Cache[pos+1];
+                       db->Cache[pos+1] = page;
+                       pos++; 
+               } else if ( pos!=0 && db->Cache[pos-1]->pagenumber > page->pagenumber ) {
+                       db->Cache[pos] = db->Cache[pos-1];
+                       db->Cache[pos-1] = page;
+                       pos--;
+               } else
+                       break; 
+       }       
+}
+
+static int
+TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) {
+       TBTMemPage      *ptr, key, **res;
+       int rc;
+       int pos;
+
+       ptr=&key;
+       key.pagenumber = pagenumber;
+
+       res = (TBTMemPage**)bsearch(&ptr, db->Cache, db->curpage, sizeof(TBTMemPage*), cmpNPage);
+       if ( res ) {
+               db->cachehits++;
+               *page=*res;
+               findAndMoveFirstTime(db, *page);
+       } else {
+               off_t  offset = (off_t)pagenumber * (off_t)TBTREEPAGESIZE;
+               
+               if( (*page = PageAlloc(db,&pos))==NULL )
+                       return TBT_ERROR;
+
+               if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
+                       tlog(TL_CRIT,"TBTReadPage: lseek failed: %s",strerror(errno));
+                       return TBT_ERROR;
+               }
+               (*page)->issynced=1;
+               (*page)->pagenumber=pagenumber;
+               if ( (rc=read(db->fd, &( (*page)->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
+                       tlog(TL_CRIT,"TBTReadPage: read failed: %s %d %d",strerror(errno), pagenumber, offset);
+                       return TBT_ERROR;
+               }
+               if ( (*page)->iscached ) {
+                       moveFirstTime(db, pos);
+                       findAndMovePN(db, *page);
+               } 
+       }
+
+       db->pageread++;
+       gettimeofday( &( (*page)->last ), NULL );
+       
+       return TBT_OK;   
+}
+
+static int
+TBTWritePage(TBTree *db, TBTMemPage *page) {
+       off_t  offset = (off_t)(page->pagenumber) * (off_t)TBTREEPAGESIZE;
+       off_t  size;
+
+       if ( page->issynced )
+               return TBT_OK;
+
+       if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
+               tlog(TL_CRIT,"TBTWritePage: lseek failed: %s",strerror(errno));
+               return TBT_ERROR;
+       }
+
+       if ( (size=write(db->fd, &( page->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
+               tlog(TL_CRIT,"TBTWritePage: write failed: %s (writed only %d bytes)",strerror(errno), size);
+               return TBT_ERROR;
+       }
+
+       page->issynced=1;
+       db->pagewrite++;
+
+       return TBT_OK;
+}
+
+static int
+TBTNewPage(TBTree *db, TBTMemPage **page) {
+       int pos;
+       if ( db->lastpagenumber==0 ) {
+               off_t  offset;
+               if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
+                       tlog(TL_CRIT,"TBTNewPage: lseek failed: %s",strerror(errno));
+                       return TBT_ERROR;
+               }
+
+               if ( offset % TBTREEPAGESIZE != 0 ) {
+                       tlog(TL_CRIT,"TBTNewPage: file corrupted");
+                       return TBT_ERROR;
+               }
+
+               db->lastpagenumber = offset/TBTREEPAGESIZE;
+       }
+
+       if( (*page = PageAlloc(db,&pos))==NULL )
+               return TBT_ERROR;
+       (*page)->pagenumber = db->lastpagenumber;
+       (*page)->issynced=0;
+       (*page)->islocked=1;
+
+       db->lastpagenumber++;
+
+       memset( &((*page)->page), 0, sizeof(TBTPAGEHDRSZ));
+       (*page)->page.freespace = TBTREEPAGESIZE-TBTPAGEHDRSZ;
+
+       gettimeofday( &( (*page)->last ), NULL );
+       if ( (*page)->iscached ) {
+               moveFirstTime(db, pos);
+               findAndMovePN(db, *page);
+       }
+       
+       return TBT_OK;
+}
+
+#define        NOTFOUND        0
+#define        FOUND           1
+
+static void
+printValue(u_int32_t len, char *ptr) {
+       putchar('\'');
+       while(len>0) {
+               putchar(*ptr);
+               ptr++; len--;
+       }
+       putchar('\'');
+}
+
+static void
+dumpPage(TBTree *db, TBTPage *page, int follow) {
+       u_int32_t i,j;
+
+       for(j=0;j<follow;j++)
+               putchar(' ');
+       printf("Page: f:%d l:%d r:%d n:%d\n", 
+               page->freespace,
+               page->isleaf,
+               page->rightlink,
+               page->npointer
+       );      
+       for(i=0;i<page->npointer;i++) {
+               TBTPointer *ptr= (TBTPointer*)(page->data+TBTPOINTERSIZE(db)*i);
+               for(j=0;j<follow+2;j++)
+                       putchar(' ');
+               printf("i:%d(%d) kl:%d ko:%d ",
+                       i, (int)ptr, 
+                       (db->keylen) ? db->keylen : ptr->key.varlen.length,
+                       (db->keylen) ? 0 : ptr->key.varlen.offset
+               );
+               if ( db->keylen==0 ) 
+                       printValue(ptr->key.varlen.length, page->data + ptr->key.varlen.offset);
+               else
+                       printf("'%d'", *(int*)(ptr->key.fixed.key));
+               if ( page->isleaf ) {
+                       printf(" vl:%d vo:%d ", 
+                               ptr->pointer.leaf.length,
+                               ptr->pointer.leaf.offset
+                       );
+                       printValue(ptr->pointer.leaf.length,  page->data + ptr->pointer.leaf.offset );
+                       putchar('\n');
+               } else {
+                       printf(" page:%d\n", ptr->pointer.internal.link);
+                       if ( follow>=0 )
+                               dumpTree(db, ptr->pointer.internal.link, follow+4); 
+               }
+       }
+}
+
+void
+dumpTree(TBTree *db, u_int32_t pagenumber, int follow) {
+       TBTMemPage  *page;
+
+       if ( TBTReadPage(db, pagenumber, &page) ) {
+               fprintf(stderr,"BEDA %d\n", pagenumber);
+               return;
+       }
+
+       page->islocked=1;
+       dumpPage(db, &(page->page), follow);
+       page->islocked=0;
+       if ( !page->iscached )
+               tfree(page);
+}
+
+static int
+findInPage(TBTree *db, TBTPage *page, TBTValue *key, TBTPointer **res) {
+       TBTPointer      *StopLow = (TBTPointer*)(page->data);
+       TBTPointer      *StopHigh = (TBTPointer*)(page->data + TBTPOINTERSIZE(db)*page->npointer);
+       TBTPointer      *StopMiddle;
+       int diff ;
+       TBTValue  A;
+
+       /* Loop invariant: StopLow <= StopMiddle < StopHigh */
+       while (StopLow < StopHigh) {
+               StopMiddle = (TBTPointer*)(((char*)StopLow) + TBTPOINTERSIZE(db)*((((char*)StopHigh - (char*)StopLow)/TBTPOINTERSIZE(db)) >> 1));
+               if ( db->keylen == 0 ) {
+                       A.length = StopMiddle->key.varlen.length;
+                       A.value = page->data + StopMiddle->key.varlen.offset;
+               } else {
+                       A.length =  db->keylen;
+                       A.value = StopMiddle->key.fixed.key;
+               }
+       
+               if ( ISINFPOINTER(db, page, StopMiddle) )
+                       diff = 1;
+               else 
+                       diff = db->cmpkey( &A, key );
+               if ( diff == 0 ) {
+                       *res = StopMiddle;
+                       return FOUND;
+               } else if ( diff < 0 )
+                       StopLow = (TBTPointer*)((char*)StopMiddle + TBTPOINTERSIZE(db));
+               else
+                       StopHigh = StopMiddle;
+       }
+
+       *res = StopHigh;
+       return NOTFOUND;
+}
+
+static int
+findPage(TBTree *db, TBTValue *key, TBTMemPage **page) {
+       u_int32_t pagenumber = TBTPAGEROOT;
+       int rc = TBT_OK;
+
+       *page=NULL;
+       while(1) {
+               TBTPointer *ptr;
+
+               if ( (rc=TBTReadPage(db, pagenumber, page))!=TBT_OK )
+                       return rc;
+
+               if ( (*page)->page.isleaf ) 
+                       return TBT_OK;
+
+               findInPage(db, &((*page)->page), key, &ptr );
+               pagenumber = ptr->pointer.internal.link;
+               if ( !(*page)->iscached )
+                       tfree(*page);
+       }
+
+       return TBT_OK;
+}
+
+int
+TBTFind(TBTree *db, TBTValue *key, TBTValue *value) {
+       TBTMemPage      *page;
+       TBTPointer *ptr;
+       int rc;
+
+       memset(value, 0, sizeof(TBTValue));
+
+       if ( (rc=findPage(db, key, &page))!=TBT_OK )
+               return rc; 
+               
+       if ( page == NULL )
+               return TBT_OK;
+
+       if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
+               if ( !(page)->iscached )
+                       tfree(page);
+               return TBT_OK;
+       } 
+
+       value->length = ptr->pointer.leaf.length;
+       value->value = tmalloc(value->length);
+       memcpy( value->value, page->page.data + ptr->pointer.leaf.offset, value->length );
+
+       if ( !(page)->iscached )
+               tfree(page);
+
+       return TBT_OK;  
+}
+
+static void
+packLeafKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, TBTValue *value) {
+       if ( ptr ) {
+               if ( (char*)ptr - page->data  != page->npointer * TBTPOINTERSIZE(db) ) 
+                       memmove((char*)ptr+TBTPOINTERSIZE(db), ptr, page->npointer * TBTPOINTERSIZE(db) - ( (char*)ptr - page->data ));
+       } else
+               ptr = (TBTPointer*)(page->data + page->npointer * TBTPOINTERSIZE(db));
+
+       page->npointer++;
+       page->freespace -= TBTPOINTERSIZE(db) + PTRALIGN(value->length);
+       if ( db->keylen ) {
+               memcpy( ptr->key.fixed.key, key->value, db->keylen );
+               ptr->pointer.leaf.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
+       } else {
+               page->freespace -= PTRALIGN(key->length);
+               ptr->key.varlen.length = key->length;
+               ptr->key.varlen.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
+               memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
+               ptr->pointer.leaf.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace + PTRALIGN(key->length);
+       } 
+       ptr->pointer.leaf.length = value->length;
+       memcpy( page->data + ptr->pointer.leaf.offset, value->value, value->length );
+}
+
+static void
+packInternalKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, u_int32_t pagenumber) {
+       if ( ptr ) {
+               if ( (char*)ptr - page->data  != page->npointer * TBTPOINTERSIZE(db) ) 
+                       memmove((char*)ptr+TBTPOINTERSIZE(db), ptr, page->npointer * TBTPOINTERSIZE(db) - ( (char*)ptr - page->data ));
+       } else
+               ptr = (TBTPointer*)(page->data + page->npointer * TBTPOINTERSIZE(db));
+
+       page->npointer++;
+       page->freespace -= TBTPOINTERSIZE(db);
+
+       if ( db->keylen ) {
+               if ( key->value )
+                       memcpy( ptr->key.fixed.key, key->value, db->keylen );
+               else
+                       memset( ptr->key.fixed.key, 0, db->keylen );
+       } else {
+               page->freespace -= PTRALIGN(key->length);
+               ptr->key.varlen.length = key->length;
+               ptr->key.varlen.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
+               if ( key->length ) 
+                       memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
+       }
+       ptr->pointer.internal.link = pagenumber;
+}
+
+
+static void
+deleteKV(TBTree *db, TBTPage *page, TBTPointer **ptrs, u_int32_t num) {
+       static char buf[TBTREEPAGESIZE];
+       char  *bufptr = buf;
+       TBTPointer *ptr = (TBTPointer*)page->data, *tmp, **ptrptr = ptrs;
+       u_int32_t i, start;
+       /* suppose ptrs are ordered! */
+       tmp = ptr;
+       while( ptrptr - ptrs < num && (char*)ptr - page->data < TBTPOINTERSIZE(db)*page->npointer ) {
+               if ( *ptrptr < ptr ) {
+                       tlog(TL_CRIT,"deleteKV: Something wrong: unexistent *ptrs"); 
+                       ptrptr++;
+               } else if ( *ptrptr > ptr ) {
+                       if ( tmp != ptr )
+                               memcpy(tmp, ptr, TBTPOINTERSIZE(db));
+                       ptr = (TBTPointer*)( (char*)ptr + TBTPOINTERSIZE(db) );
+                       tmp = (TBTPointer*)( (char*)tmp + TBTPOINTERSIZE(db) );
+               } else {
+                       page->freespace += TBTPOINTERSIZE(db) + 
+                               ( ( page->isleaf ) ? PTRALIGN(ptr->pointer.leaf.length) : 0 ) +
+                               ( ( db->keylen ) ? 0 : PTRALIGN(ptr->key.varlen.length) );
+                       ptrptr++;
+                       ptr = (TBTPointer*)( (char*)ptr + TBTPOINTERSIZE(db) );
+               }
+       }
+
+       if ( (char*)ptr - page->data < TBTPOINTERSIZE(db)*page->npointer && tmp!=ptr )
+               memmove( tmp, ptr, page->npointer * TBTPOINTERSIZE(db) - ((char*)ptr - page->data) );
+
+       page->npointer-=num;
+
+       tmp = (TBTPointer*)page->data;
+       start = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
+       for(i=0;i<page->npointer;i++) {
+               if ( page->isleaf ) {
+                       memcpy( bufptr, page->data + tmp->pointer.leaf.offset, tmp->pointer.leaf.length );
+                       tmp->pointer.leaf.offset = start + (bufptr-buf);
+                       bufptr+=PTRALIGN(tmp->pointer.leaf.length);
+               }
+
+               if ( db->keylen == 0 ) {
+                       if ( tmp->key.varlen.length )
+                               memcpy( bufptr, page->data + tmp->key.varlen.offset, tmp->key.varlen.length );
+                       tmp->key.varlen.offset = start + (bufptr-buf);
+                       bufptr+=PTRALIGN(tmp->key.varlen.length);
+               }
+               tmp = (TBTPointer*)( (char*)tmp + TBTPOINTERSIZE(db) );
+       }
+
+       memcpy( page->data + start, buf, bufptr-buf );
+}
+
+static u_int32_t
+findDelimiter(TBTree *db, TBTPage *page, TBTPointer *ptr, int size, u_int32_t start, int *where) {
+       int  *sizes = (int*) tmalloc( sizeof(int) * (page->npointer+1) );
+       int i;
+       TBTPointer *tomove;
+       int lfree=TBTREEPAGESIZE-TBTPAGEHDRSZ, rfree=TBTREEPAGESIZE-TBTPAGEHDRSZ;
+       int nptr = ( (char*)ptr - page->data ) / TBTPOINTERSIZE(db); 
+       int was=0;
+
+       for(i=0; i<page->npointer;i++) {
+               tomove = (TBTPointer*)(page->data + i*TBTPOINTERSIZE(db));
+               if ( was==0 && i==nptr ) {
+                       sizes[i] = size;
+                       was=1;
+               } 
+               sizes[i+was] = TBTPOINTERSIZE(db) +
+                       ( (page->isleaf) ? PTRALIGN(tomove->pointer.leaf.length) : 0 ) +
+                       ( ( db->keylen ) ? 0 : PTRALIGN(tomove->key.varlen.length) );
+       
+       }
+       
+       if ( was==0 )
+               sizes[page->npointer]=size;
+
+       for(i=0;i<page->npointer+1;i++) {
+               if ( i< start )
+                       lfree-=sizes[i];
+               else 
+                       rfree-=sizes[i];
+       }
+
+       while( 1 ) {
+               if ( lfree<0 ) {
+                       lfree+=sizes[start];
+                       rfree-=sizes[start];
+                       start--;
+               } else if ( rfree < 0 ) { 
+                       lfree-=sizes[start];
+                       rfree+=sizes[start];
+                       start++;
+               } else
+                       break;
+       }
+
+       *where = (nptr < start) ? -1 : 1;
+       if ( start>nptr )
+               start--;
+
+       tfree(sizes);
+       return start;
+}
+
+static void
+populatePages(TBTree *db, TBTPage *srcpage, TBTPage* newpage, u_int32_t start) {
+       u_int32_t i;
+       TBTPointer *tomove, **todelete = tmalloc( sizeof(TBTPointer*)* srcpage->npointer );
+       u_int32_t numtodelete=0;
+       TBTValue k, v;
+
+       for(i=start;i<srcpage->npointer;i++) {
+               tomove = (TBTPointer*)(srcpage->data + i*TBTPOINTERSIZE(db));
+
+               todelete[numtodelete] = tomove;
+               numtodelete++;
+
+               if ( db->keylen ) 
+                       k.value = tomove->key.fixed.key;
+               else {
+                       k.length = tomove->key.varlen.length;
+                       k.value = srcpage->data + tomove->key.varlen.offset;
+               } 
+                               
+               if ( srcpage->isleaf ) {
+                       v.length = tomove->pointer.leaf.length;
+                       v.value = srcpage->data + tomove->pointer.leaf.offset;
+                       packLeafKV(db, newpage, NULL, &k, &v);
+               } else 
+                       packInternalKV(db, newpage, NULL, &k, tomove->pointer.internal.link);
+       }
+
+       deleteKV(db, srcpage, todelete, numtodelete);   
+       tfree(todelete);
+}
+
+static int
+splitPage(TBTree *db, TBTMemPage *srcpage, TBTMemPage** newpage, TBTPointer **ptr, int size, TBTValue *key, TBTValue *value, u_int32_t pagenumber) {
+       TBTMemPage *tmp;
+       int start=0, where=0;
+       int nptr = ( (char*)(*ptr) - srcpage->page.data ) / TBTPOINTERSIZE(db);
+
+       if ( TBTNewPage(db, newpage) )
+               return TBT_ERROR;
+
+       tmp = *newpage;
+       srcpage->issynced=tmp->issynced=0;
+       tmp->islocked=1;
+       tmp->page.rightlink = srcpage->page.rightlink;
+       srcpage->page.rightlink = tmp->pagenumber;
+       tmp->page.isleaf = srcpage->page.isleaf;
+
+       switch(db->strategy) {
+               case TBT_STATEGY_HALF:
+                       start = findDelimiter(db, &(srcpage->page), *ptr, size, (srcpage->page.npointer+1)/2, &where);  
+                       break;
+               case TBT_STATEGY_LEFTFILL:
+                       start = findDelimiter(db, &(srcpage->page), *ptr, size, srcpage->page.npointer, &where);        
+                       break;
+               case TBT_STATEGY_RIGHTFILL:
+                       start = findDelimiter(db, &(srcpage->page), *ptr, size, 0, &where);     
+                       break;
+               default:
+                       tlog(TL_CRIT,"splitPage: unknown strategy: %d", db->strategy);
+                       return TBT_ERROR;
+       }
+
+       populatePages(db, &(srcpage->page), &(tmp->page), start);
+       if ( where<0 )
+               tmp=srcpage;
+       else
+               nptr-=start;
+       *ptr = (TBTPointer*)(tmp->page.data + nptr*TBTPOINTERSIZE(db));
+
+       if ( tmp->page.isleaf ) 
+               packLeafKV(db, &(tmp->page), *ptr, key, value);
+       else
+               packInternalKV(db, &(tmp->page), *ptr, key, pagenumber);
+
+       return TBT_OK;
+}
+
+static int
+savePage(TBTree *db, TBTMemPage *page) {
+       int rc;
+       page->islocked=0;
+
+       if ( !page->iscached ) {
+               if (  page->issynced==0 && (rc=TBTWritePage(db, page)) != TBT_OK )
+                       return rc;
+               tfree(page);
+       }
+       return TBT_OK;
+}
+
+static int
+layerInsert(TBTree *db, u_int32_t pagenumber, TBTMemPage **left, TBTMemPage **right, TBTValue *key, TBTValue *value) {
+       int rc,fnd;
+       TBTPointer      *ptr=NULL;
+       TBTMemPage      *page=NULL, *newright=NULL;
+                       
+       if ( (rc=TBTReadPage(db, pagenumber, &page))!=TBT_OK )
+               return rc;
+
+       page->islocked=1;
+
+       fnd = findInPage(db, &(page->page), key, &ptr );
+
+       *left=*right=NULL;
+       if ( page->page.isleaf ) {
+               u_int32_t size = TBTPOINTERSIZE(db) +
+                       PTRALIGN(value->length) +
+                       ( ( db->keylen ) ? 0 : PTRALIGN(key->length) );
+               
+               if ( size>(TBTREEPAGESIZE-TBTPAGEHDRSZ)/2 ) {
+                       tlog(TL_ALARM,"Huge size: %d bytes", size);
+                       return TBT_HUGE;
+               }
+
+               if (fnd==FOUND) {
+                       deleteKV(db, &(page->page), &ptr, 1);
+                       page->issynced=0;
+               }
+
+               if ( size <= page->page.freespace ) {
+                       packLeafKV(db, &(page->page), ptr, key, value);
+                       page->issynced=0;
+               } else {
+                       rc = splitPage(db, page, &newright, &ptr, size, key, value, 0);
+                       if ( rc != TBT_OK ) return rc;
+
+                       page->issynced = newright->issynced = 0;
+                       page->islocked = newright->islocked = 1;
+               }
+       } else {
+               if ( (rc=layerInsert(db, ptr->pointer.internal.link, left, right, key, value))!=TBT_OK ) {
+                       page->islocked=0;
+                       return rc;
+               }
+
+               if ( *left ) { /* child page was splited */
+                       u_int32_t size;
+                       TBTPointer *rightptr = (TBTPointer*)( (*left)->page.data + ((*left)->page.npointer-1) * TBTPOINTERSIZE(db) );
+                       TBTValue K;
+
+                       if ( ISINFPOINTER(db, &(page->page), ptr) ) {
+                               deleteKV(db, &(page->page), &ptr, 1);
+                               K.length = 0; K.value = NULL;
+                               packInternalKV(db, &(page->page), ptr, &K, (*right)->pagenumber);
+                       } else 
+                               ptr->pointer.internal.link=(*right)->pagenumber;
+
+                       K.length = ( db->keylen ) ? db->keylen : rightptr->key.varlen.length;
+                       K.value  = ( db->keylen ) ? rightptr->key.fixed.key : (*left)->page.data + rightptr->key.varlen.offset;
+
+                       size = TBTPOINTERSIZE(db) + ( ( db->keylen ) ? 0 : PTRALIGN(K.length) );
+
+                       if ( size <= page->page.freespace ) {
+                               packInternalKV(db, &(page->page), ptr, &K, (*left)->pagenumber);
+                               page->issynced=0;
+                       } else {
+                               rc = splitPage(db, page, &newright, &ptr, size, &K, NULL, (*left)->pagenumber);
+                               if ( rc != TBT_OK ) return rc;
+
+                               page->issynced = newright->issynced = 0;
+                               page->islocked = newright->islocked = 1;
+                       }
+               }
+       }
+
+       if ( *left ) {
+               if ( (rc=savePage(db, *left))!=TBT_OK )
+                       return rc;
+               if ( (rc=savePage(db, *right))!=TBT_OK )
+                       return rc;
+               *left=*right=NULL;
+       }
+
+       if ( newright ) {
+               if ( page->pagenumber==TBTPAGEROOT ) { /* root was splited */
+                       TBTMemPage  *newleft;
+                       TBTValue K;
+               
+
+                       if  ( (rc=TBTNewPage(db,&newleft))!=TBT_OK )
+                               return rc;
+                       memcpy( &(newleft->page) , &(page->page), TBTREEPAGESIZE);
+                       memset( &(page->page), 0, TBTREEPAGESIZE);
+                       page->page.freespace=TBTREEPAGESIZE-TBTPAGEHDRSZ;
+       
+                       ptr = (TBTPointer*)( newleft->page.data + (newleft->page.npointer-1) * TBTPOINTERSIZE(db) ); 
+                       K.length = ( db->keylen ) ? db->keylen : ptr->key.varlen.length;
+                       K.value  = ( db->keylen ) ? ptr->key.fixed.key : newleft->page.data + ptr->key.varlen.offset;
+                       packInternalKV(db, &(page->page), NULL, &K, newleft->pagenumber);
+               
+                       ptr = (TBTPointer*)( newright->page.data + (newright->page.npointer-1) * TBTPOINTERSIZE(db) ); 
+                       K.length = 0;
+                       K.value  = NULL;
+                       packInternalKV(db, &(page->page), NULL, &K, newright->pagenumber);
+
+                       if ( (rc=savePage(db, newright))!=TBT_OK )
+                               return rc;
+                       if ( (rc=savePage(db, newleft))!=TBT_OK )
+                               return rc;
+                       if ( (rc=savePage(db, page))!=TBT_OK )
+                               return rc;
+               } else {
+                       *left = page;
+                       *right = newright;
+               }
+       } else if ( (rc=savePage(db, page))!=TBT_OK )
+               return rc;
+       
+
+       return TBT_OK;
+} 
+
+int
+TBTInsert(TBTree *db, TBTValue *key, TBTValue *value) {
+       int rc;
+       TBTMemPage *lpage=NULL, *rpage=NULL;
+
+       rc = layerInsert(db, TBTPAGEROOT, &lpage, &rpage, key, value);
+
+       tassert( lpage==NULL && rpage==NULL );
+
+       return rc;
+}
+
+int
+TBTDelete(TBTree *db, TBTValue *key) {
+       TBTMemPage      *page;
+       TBTPointer *ptr;
+       int rc;
+
+       if ( (rc=findPage(db, key, &page))!=TBT_OK )
+               return rc; 
+               
+       if ( page == NULL )
+               return TBT_OK;
+
+       if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
+               if ( !(page)->iscached )
+                       tfree(page);
+               return TBT_OK;
+       } 
+
+       deleteKV(db, &(page->page), &ptr, 1);
+       page->issynced=0;
+
+       rc=savePage(db, page);
+
+       return rc;      
+}
+
+int 
+TBTInitIterator(TBTree *db, TBTIterator *iterator) {
+       TBTMemPage      *page=NULL;
+       TBTPointer *ptr;
+       int rc;
+       u_int32_t pagenum=TBTPAGEROOT;
+
+       memset(iterator, 0, sizeof(TBTIterator));
+
+       do {
+               if ( page && !page->iscached )
+                       tfree(page);
+               if ( (rc=TBTReadPage(db, pagenum, &page))!=TBT_OK )
+                       return rc;
+
+               ptr = (TBTPointer*)( page->page.data );
+               pagenum = ptr->pointer.internal.link;
+       } while( !page->page.isleaf );
+
+       page->islocked=1;
+       iterator->page = page;  
+       iterator->ptr = ptr;    
+
+       return TBT_OK;
+}
+
+void 
+TBTFreeIterator(TBTree *db, TBTIterator *iterator) {
+       
+       if ( iterator->page ) {
+               iterator->page->islocked=0;
+               if ( !iterator->page->iscached )
+                       tfree(iterator->page);
+       }
+
+       memset(iterator, 0, sizeof(TBTIterator));
+}
+
+int 
+TBTIterate(TBTree *db, TBTIterator *iterator, TBTValue *key, TBTValue *value ) {
+       int rc;
+
+       if (  iterator->page==NULL ) {
+               memset(key, 0, sizeof(TBTValue));
+               memset(value, 0, sizeof(TBTValue));
+               return TBT_OK;
+       }
+
+       if ( (char*)(iterator->ptr) - iterator->page->page.data >= TBTPOINTERSIZE(db) * iterator->page->page.npointer ) {
+               do {
+                       u_int32_t pagenumber = iterator->page->page.rightlink;
+                       iterator->page->islocked=0;
+                       if ( !iterator->page->iscached )
+                               tfree(iterator->page);
+                       iterator->page=NULL;
+
+                       if ( pagenumber==TBTPAGEROOT ) {
+                               memset(key, 0, sizeof(TBTValue));
+                               memset(value, 0, sizeof(TBTValue));
+                               return TBT_OK;
+                       }
+                       if ( (rc=TBTReadPage(db, pagenumber, &(iterator->page)))!=TBT_OK )
+                               return rc;
+               } while ( iterator->page->page.npointer == 0 );
+
+               iterator->page->islocked=1;
+               iterator->ptr = (TBTPointer*)( iterator->page->page.data );
+       }
+
+       if ( db->keylen ) {
+               key->length = db->keylen;
+               key->value = iterator->ptr->key.fixed.key;
+       } else {
+               key->length = iterator->ptr->key.varlen.length; 
+               key->value = iterator->page->page.data + iterator->ptr->key.varlen.offset;
+       }
+
+       value->length = iterator->ptr->pointer.leaf.length; 
+       value->value = iterator->page->page.data + iterator->ptr->pointer.leaf.offset;
+
+       iterator->ptr = ( TBTPointer* ) (  (char*)(iterator->ptr) + TBTPOINTERSIZE(db) );
+
+       return TBT_OK;
+}
+
+
diff --git a/tbtree.h b/tbtree.h
new file mode 100644 (file)
index 0000000..fa11321
--- /dev/null
+++ b/tbtree.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2005 Teodor Sigaev <teodor@sigaev.ru>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *      may be used to endorse or promote products derived from this software
+ *      without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __TBTREE_H__
+#define __TBTREE_H__
+
+
+#include <sys/types.h>
+
+/* C-utils */
+#ifndef offsetof
+#define offsetof(type, field)   ((int) &((type *)0)->field)
+#endif   /* offsetof */
+
+#define TYPEALIGN(ALIGNVAL,LEN)  \
+        (((long) (LEN) + (ALIGNVAL-1)) & ~((long) (ALIGNVAL-1)))
+
+#define PTRALIGN(LEN)     TYPEALIGN(sizeof(void*), (LEN))
+
+/* end utils */
+
+
+typedef struct {
+       union {
+               struct {
+                       u_int16_t offset;
+                       u_int16_t length;
+               } leaf;
+               struct {
+                       u_int32_t link;
+               } internal;
+       } pointer;
+       union {
+               struct {
+                       u_int16_t       offset;
+                       u_int16_t       length;
+               } varlen;
+               struct {
+                       char key[1];
+               } fixed;
+       } key;
+} TBTPointer;
+
+#define TBTPOINTERHRDSZ        (offsetof(TBTPointer, key.fixed.key))
+#define TBTPOINTERSIZE(db)     PTRALIGN( (db->keylen) ? TBTPOINTERHRDSZ + db->keylen : sizeof(TBTPointer) )
+#define ISINFPOINTER(db, page, ptr)  ( (page)->isleaf==0 && (page)->rightlink == 0 && (char*)(ptr) == (page)->data + ((page)->npointer-1) * TBTPOINTERSIZE(db) ) 
+
+#define TBTREEPAGESIZE 8192
+#define TBTPAGEHDRSZ   (2*sizeof(u_int32_t))
+
+typedef struct {
+       u_int32_t       rightlink;
+       u_int32_t       
+               freespace:13, /* correlate to BTREEPAGESIZE */
+               npointer:10,
+               isleaf:1,
+               unused: 8;
+       char    data[TBTREEPAGESIZE-TBTPAGEHDRSZ];      
+} TBTPage;
+
+typedef struct {
+       u_int32_t       pagenumber;
+       struct timeval last;
+       u_int32_t
+               issynced:1,
+               iscached:1,
+               islocked:1,
+               unused:29;
+       TBTPage page;
+} TBTMemPage;
+
+typedef struct {
+       u_int16_t       length;
+       char            *value;
+} TBTValue;
+
+#define TBT_STATEGY_HALF       0
+#define TBT_STATEGY_LEFTFILL   1
+#define TBT_STATEGY_RIGHTFILL  2
+
+typedef struct {
+       int     fd;     /* file descriptor */
+
+       int     (*cmpkey)(const TBTValue *, const TBTValue *);  
+
+       u_int32_t
+               readonly:1,
+               keylen:11,
+               strategy:2,
+               unused:18;
+
+       /* cache page subsystem */
+       u_int32_t       npage;
+       u_int32_t       curpage;
+       TBTMemPage      **Cache;
+       TBTMemPage      **TimeCache;
+       u_int32_t       lastpagenumber;
+
+       /* stat subsystem */
+       u_int32_t       cachehits;
+       u_int32_t       pageread; /* include cachehits */
+       u_int32_t       pagewrite;
+} TBTree;
+
+#define TBT_OK                 0
+#define TBT_ERROR      1
+#define TBT_HUGE       2
+
+#define TBTPAGEROOT    0
+
+int TBTOpen(TBTree *db, char *file) ;
+int TBTClose(TBTree *db) ;
+int TBTSync(TBTree *db) ;
+int TBTFind(TBTree *db, TBTValue *key, TBTValue *value);
+int TBTInsert(TBTree *db, TBTValue *key, TBTValue *value);
+int TBTDelete(TBTree *db, TBTValue *key);
+void dumpTree(TBTree *db, u_int32_t pagenumber, int follow);
+
+typedef struct {
+       TBTMemPage      *page;
+       TBTPointer      *ptr;
+} TBTIterator;
+
+int TBTInitIterator(TBTree *db, TBTIterator *iterator );
+int TBTIterate(TBTree *db, TBTIterator *iterator, TBTValue *key, TBTValue *value );
+void TBTFreeIterator(TBTree *db, TBTIterator *iterator);
+
+
+#endif
diff --git a/tbtreetest.c b/tbtreetest.c
new file mode 100644 (file)
index 0000000..c99578e
--- /dev/null
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2004 Teodor Sigaev <teodor@sigaev.ru>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *      may be used to endorse or promote products derived from this software
+ *      without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <ctype.h>
+
+#include "tlog.h"
+#include "tmalloc.h"
+#include "tbtree.h"
+
+static void
+usage() {
+       puts(
+       "Usage:\n"
+       "tbtreetest [ -c CACHESIZE ] [-r] [-k] [-f FILE] [-D | -L | -b | -i KEY -v VALUE [ -S strategynumber ] | -d KEY | -s KEY ]\n"
+       );
+       exit(1);
+}
+
+extern char *optarg;
+extern int opterr;
+
+#define MODE_SEARCH    1
+#define MODE_INSERT    2
+#define MODE_DELETE    3
+#define MODE_DUMP      4
+#define MODE_LIST      5
+#define MODE_BULK      6
+
+
+static TBTValue K,V;
+static int Ki;
+
+static void
+fillKV(int keylen, char *key, char *value) {
+       if ( keylen ) {
+               Ki=atoi(key);
+               K.value=(char*)&Ki;
+       } else {
+               K.length = strlen(key);
+               K.value = key;
+       }
+       if( value ) {
+               V.length = strlen(value);
+               V.value = value;
+       }
+}
+
+static int
+cmpINT(const TBTValue *a, const TBTValue *b) {
+       if ( *(int*)(a->value) == *(int*)(b->value) ) return 0;
+       return ( *(int*)(a->value) > *(int*)(b->value) ) ? 1 : -1;
+} 
+
+static int
+cmpSTR(const TBTValue *a, const TBTValue *b) {
+       int rc = strncmp( a->value, b->value, (a->length > b->length) ?  b->length : a->length);
+       if ( rc ) return rc;
+       if ( a->length ==  b->length ) return 0;
+       return ( a->length > b->length ) ? 1 : -1;
+}
+
+static void
+printLSTR(u_int32_t len, char *ptr) {
+        while(len>0) {
+                putchar(*ptr);
+                ptr++; len--;
+        }
+}
+
+int
+main(int argn, char *argv[]) {
+       TBTree db;
+       int i;
+       int rc=0;
+       char *file="BTREE";
+       char *key=NULL, *val=NULL;
+       int mode=0;
+
+       opentlog(TL_OPEN_STDERR,TL_DEBUG, NULL);
+
+       memset(&db, 0, sizeof(TBTree));
+
+       while((i=getopt(argn,argv,"bS:Dc:hrkf:i:v:d:s:L")) != EOF) {
+               switch(i) {
+                       case 'r':
+                               db.readonly=1;
+                               break;
+                       case 'f':
+                               file = strdup(optarg);
+                               break;
+                       case 'k':
+                               db.keylen=sizeof(int);
+                               break;
+                       case 'i':
+                               key = strdup(optarg);
+                               mode=MODE_INSERT;
+                               break;
+                       case 'd':
+                               key = strdup(optarg);
+                               mode=MODE_DELETE;
+                               break;
+
+                               key = strdup(optarg);
+                               mode=MODE_SEARCH;
+                               break;
+                       case 'v':
+                               val = strdup(optarg); 
+                               mode=MODE_INSERT;
+                               break;
+                       case 'c':
+                               db.npage = atoi(optarg);
+                               break;
+                       case 'S':
+                               db.strategy = atoi(optarg);
+                               if ( db.strategy > 2 )
+                                       db.strategy = TBT_STATEGY_RIGHTFILL;
+                               break;
+                       case 'D':
+                               mode=MODE_DUMP;
+                               break;
+                       case 'L':
+                               mode=MODE_LIST;
+                               break;
+                       case 'b':
+                               mode=MODE_BULK;
+                               break;
+                       case 'h':
+                       default:
+                               usage();
+                               break;
+               }
+       }
+       
+       db.cmpkey = (db.keylen) ? cmpINT : cmpSTR;
+                       
+       if ( (rc=TBTOpen(&db, file))!= TBT_OK )
+               exit(1);
+
+       if ( mode==MODE_INSERT && key && val ) {
+               fillKV(db.keylen, key, val);
+               rc = TBTInsert(&db, &K, &V);
+       } else if ( mode==MODE_SEARCH && key ) {
+               fillKV(db.keylen, key, NULL);
+               rc = TBTFind(&db, &K, &V);
+               if ( rc == TBT_OK ) {
+                       if ( V.value ) {
+                               int i;
+                               printf("Value: '");
+                               for(i=0;i<V.length;i++)
+                                       fputc(V.value[i], stdout);
+                               fputc('\'', stdout);
+                               fputc('\n', stdout);
+                       } else {
+                               printf("Not found\n");
+                       }
+               }
+       } else if ( mode==MODE_DELETE && key ) {
+               fillKV(db.keylen, key, NULL);
+               rc = TBTDelete(&db, &K);
+       } else if ( mode==MODE_DUMP ) {
+               dumpTree(&db, TBTPAGEROOT, 0);
+       } else if ( mode==MODE_LIST ) {
+               TBTIterator     iterator;
+               if ( (rc=TBTInitIterator(&db, &iterator))==TBT_OK ) {
+                       TBTValue key, value;
+                       while(1) { 
+                               rc = TBTIterate(&db, &iterator, &key, &value); 
+                               if ( rc == TBT_OK && key.value ) {
+                                       fputs("I\t", stdout);
+                                       if ( db.keylen )
+                                               printf("%d", *(int*)(key.value));
+                                       else 
+                                               printLSTR(key.length, key.value);
+                                       fputc('\t', stdout);
+                                       printLSTR(value.length, value.value);
+                                       fputc('\n', stdout);
+                               } else {
+                                       break;
+                               }
+                       }
+
+               }
+               TBTFreeIterator(&db, &iterator);
+       } else if ( mode==MODE_BULK ) {
+               char buf[TBTREEPAGESIZE];
+               int tmp;
+               TBTValue key, value;
+               char *ptr ;
+
+               rc=TBT_OK;
+               while( rc==TBT_OK && fgets(buf, TBTREEPAGESIZE-1, stdin) ) {
+                       if ( *buf == 'I' ) {
+                               ptr=buf+1;
+                               while( *ptr && isspace(*ptr) ) ptr++;
+                               key.value=ptr;
+                               while( *ptr && !isspace(*ptr) ) ptr++;
+                               key.length = ptr-key.value;
+                               while( *ptr && isspace(*ptr) ) ptr++;
+                               value.value=ptr;
+                               while( *ptr && !isspace(*ptr) ) ptr++;
+                               value.length = ptr-value.value;
+                               if ( db.keylen ) {
+                                       tmp=atoi(key.value);
+                                       key.length = db.keylen;
+                                       key.value=(char*)&tmp;
+                               }
+                               rc=TBTInsert(&db, &key, &value);
+                       } else if ( *buf == 'D' || *buf == 'S' ) {
+                               ptr=buf+1;
+                               while( *ptr && isspace(*ptr) ) ptr++;
+                               key.value=ptr;
+                               while( *ptr && !isspace(*ptr) ) ptr++;
+                               key.length = ptr-key.value;
+                               if ( db.keylen ) {
+                                       tmp=atoi(key.value);
+                                       key.length = db.keylen;
+                                       key.value=(char*)&tmp;
+                               }
+                               if ( *buf == 'D' ) {
+                                       rc=TBTDelete(&db, &key);
+                               } else if ( (rc=TBTFind(&db, &key, &value))==TBT_OK ) {
+                                       if ( db.keylen )
+                                               printf("%d", *(int*)(key.value));
+                                       else 
+                                               printLSTR(key.length, key.value);
+                                       if ( value.value ) {
+                                               fputc('\t', stdout);
+                                               printLSTR(value.length, value.value);
+                                       }
+                                       fputc('\n', stdout);
+                               }
+                               
+                       }
+               }  
+       } else {
+               TBTClose(&db);
+               usage();
+       }
+
+       if ( rc ) printf("Method returns %d\n", rc);
+
+       TBTSync(&db);
+       printf("Page read: %d (include cache hits %d)\n", db.pageread, db.cachehits);
+       printf("Page write: %d\n", db.pagewrite);
+
+       TBTClose(&db);
+
+       return 0;
+}
+
+       
+