+/*
+ * Copyright (c) 2005 Teodor Sigaev <teodor@sigaev.ru>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+
+#include "tmalloc.h"
+#include "tlog.h"
+
+#include "tbtree.h"
+
+static int TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) ;
+static int TBTWritePage(TBTree *db, TBTMemPage *page) ;
+static int TBTNewPage(TBTree *db, TBTMemPage **page) ;
+
+int
+TBTOpen(TBTree *db, char *file) {
+ off_t offset;
+
+ if ( db->readonly ) {
+ db->fd = open(file, O_RDONLY);
+ if ( db->fd < 0 ) {
+ tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
+ return TBT_ERROR;
+ }
+ if ( flock( db->fd, LOCK_SH ) < 0 ) {
+ tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
+ close(db->fd);
+ return TBT_ERROR;
+ }
+ } else {
+ db->fd = open(file, O_CREAT | O_RDWR, 0666);
+ if ( db->fd < 0 ) {
+ tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
+ return TBT_ERROR;
+ }
+ if ( flock( db->fd, LOCK_EX ) < 0 ) {
+ tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
+ close(db->fd);
+ return TBT_ERROR;
+ }
+ }
+
+ if ( db->npage ) {
+ db->Cache=(TBTMemPage**)t0malloc( sizeof(TBTMemPage*)*db->npage );
+ db->TimeCache=(TBTMemPage**)t0malloc( sizeof(TBTMemPage*)*db->npage );
+ db->curpage=0;
+ }
+
+ db->cachehits = db->pageread = db->pagewrite = 0;
+
+ if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
+ tlog(TL_CRIT,"TBTOpen: lseek failed: %s",strerror(errno));
+ flock( db->fd, LOCK_UN );
+ close(db->fd);
+ return TBT_ERROR;
+ }
+
+ if ( offset==0 ) {
+ if ( db->readonly ) {
+ tlog(TL_CRIT,"TBTOpen: file is void");
+ flock( db->fd, LOCK_UN );
+ close(db->fd);
+ return TBT_ERROR;
+ } else {
+ TBTMemPage *page;
+ if ( TBTNewPage(db, &page) != TBT_OK ) {
+ flock( db->fd, LOCK_UN );
+ close(db->fd);
+ return TBT_ERROR;
+ }
+ page->page.isleaf=1;
+ if ( TBTWritePage(db, page)!=TBT_OK ){
+ flock( db->fd, LOCK_UN );
+ close(db->fd);
+ return TBT_ERROR;
+ }
+ }
+ }
+
+ db->lastpagenumber=0;
+ return TBT_OK;
+}
+
+int
+TBTClose(TBTree *db) {
+ int rc=0;
+
+ if ( !db->readonly )
+ rc=TBTSync(db);
+
+ if ( db->npage ) {
+ u_int32_t i;
+
+ for(i=0; i<db->curpage; i++)
+ tfree( db->Cache[i] );
+
+ tfree( db->Cache );
+ tfree( db->TimeCache );
+ }
+
+ flock( db->fd, LOCK_UN );
+ close(db->fd);
+
+ return (rc) ? TBT_ERROR : TBT_OK;
+}
+
+int
+TBTSync(TBTree *db) {
+ u_int32_t i;
+ int rc=0;
+
+ if ( db->readonly || db->npage==0 )
+ return TBT_OK;
+
+ for(i=0; i<db->curpage; i++)
+ if ( !db->Cache[i]->issynced )
+ rc |= TBTWritePage(db, db->Cache[i]);
+
+ if ( fsync(db->fd) ) {
+ tlog(TL_CRIT,"TBTSync: fsync failed: %s",strerror(errno));
+ rc=1;
+ }
+
+ return (rc) ? TBT_ERROR : TBT_OK;
+}
+
+static int
+cmpNPage(const void *a, const void *b) {
+ if ( (*(TBTMemPage**)a)->pagenumber == (*(TBTMemPage**)b)->pagenumber )
+ return 0;
+ return ( (*(TBTMemPage**)a)->pagenumber > (*(TBTMemPage**)b)->pagenumber ) ? 1 : -1;
+}
+
+static TBTMemPage*
+PageAlloc(TBTree *db, int *pos) {
+ TBTMemPage *page;
+
+ if ( db->curpage < db->npage ) {
+ page = db->Cache[db->curpage] = db->TimeCache[db->curpage] = (TBTMemPage*)t0malloc(sizeof(TBTMemPage));
+ *pos = db->curpage;
+ page->iscached=1;
+ db->curpage++;
+ } else if ( db->npage == 0 ) {
+ page = (TBTMemPage*)t0malloc(sizeof(TBTMemPage));
+ } else {
+ u_int32_t i;
+ for( i=0; i<db->curpage && db->TimeCache[i]->islocked; i++ );
+ if ( i == db->curpage ) { /*all pages locked*/
+ /*tlog(TL_WARN,"TBTReadPage: all pages in cache locked, increase number of cached pages");*/
+ page = (TBTMemPage*)t0malloc(sizeof(TBTMemPage));
+ } else {
+ page = db->TimeCache[i];
+ if ( !page->issynced )
+ if ( TBTWritePage(db, page) )
+ return NULL;
+ memset(page,0,sizeof(TBTMemPage));
+ page->iscached=1;
+ *pos=i;
+ }
+ }
+
+ return page;
+}
+
+static void
+moveFirstTime(TBTree *db, int pos) {
+ TBTMemPage *ptr;
+
+ if ( pos < 0 )
+ return;
+ if (pos+1==db->curpage )
+ return;
+
+ ptr = db->TimeCache[pos];
+ memmove( db->TimeCache + pos , db->TimeCache + pos + 1, sizeof(TBTMemPage*)*(db->curpage-pos-1) );
+ db->TimeCache[db->curpage-1] = ptr;
+}
+
+static void
+findAndMoveFirstTime(TBTree *db, TBTMemPage *page) {
+ int pos;
+ for(pos=0;pos<db->curpage;pos++)
+ if ( page==db->TimeCache[pos] ) {
+ moveFirstTime(db,pos);
+ break;
+ }
+ tassert( pos<db->curpage );
+}
+
+static void
+findAndMovePN(TBTree *db, TBTMemPage *page) {
+ int pos;
+
+ if ( db->curpage < 2 )
+ return;
+
+ for(pos=0;pos<db->curpage;pos++)
+ if ( page==db->Cache[pos] )
+ break;
+ tassert( pos<db->curpage );
+
+ while(1) {
+ if ( pos+1 != db->curpage && db->Cache[pos+1]->pagenumber < page->pagenumber ) {
+ db->Cache[pos] = db->Cache[pos+1];
+ db->Cache[pos+1] = page;
+ pos++;
+ } else if ( pos!=0 && db->Cache[pos-1]->pagenumber > page->pagenumber ) {
+ db->Cache[pos] = db->Cache[pos-1];
+ db->Cache[pos-1] = page;
+ pos--;
+ } else
+ break;
+ }
+}
+
+static int
+TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) {
+ TBTMemPage *ptr, key, **res;
+ int rc;
+ int pos;
+
+ ptr=&key;
+ key.pagenumber = pagenumber;
+
+ res = (TBTMemPage**)bsearch(&ptr, db->Cache, db->curpage, sizeof(TBTMemPage*), cmpNPage);
+ if ( res ) {
+ db->cachehits++;
+ *page=*res;
+ findAndMoveFirstTime(db, *page);
+ } else {
+ off_t offset = (off_t)pagenumber * (off_t)TBTREEPAGESIZE;
+
+ if( (*page = PageAlloc(db,&pos))==NULL )
+ return TBT_ERROR;
+
+ if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
+ tlog(TL_CRIT,"TBTReadPage: lseek failed: %s",strerror(errno));
+ return TBT_ERROR;
+ }
+ (*page)->issynced=1;
+ (*page)->pagenumber=pagenumber;
+ if ( (rc=read(db->fd, &( (*page)->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
+ tlog(TL_CRIT,"TBTReadPage: read failed: %s %d %d",strerror(errno), pagenumber, offset);
+ return TBT_ERROR;
+ }
+ if ( (*page)->iscached ) {
+ moveFirstTime(db, pos);
+ findAndMovePN(db, *page);
+ }
+ }
+
+ db->pageread++;
+ gettimeofday( &( (*page)->last ), NULL );
+
+ return TBT_OK;
+}
+
+static int
+TBTWritePage(TBTree *db, TBTMemPage *page) {
+ off_t offset = (off_t)(page->pagenumber) * (off_t)TBTREEPAGESIZE;
+ off_t size;
+
+ if ( page->issynced )
+ return TBT_OK;
+
+ if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
+ tlog(TL_CRIT,"TBTWritePage: lseek failed: %s",strerror(errno));
+ return TBT_ERROR;
+ }
+
+ if ( (size=write(db->fd, &( page->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
+ tlog(TL_CRIT,"TBTWritePage: write failed: %s (writed only %d bytes)",strerror(errno), size);
+ return TBT_ERROR;
+ }
+
+ page->issynced=1;
+ db->pagewrite++;
+
+ return TBT_OK;
+}
+
+static int
+TBTNewPage(TBTree *db, TBTMemPage **page) {
+ int pos;
+ if ( db->lastpagenumber==0 ) {
+ off_t offset;
+ if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
+ tlog(TL_CRIT,"TBTNewPage: lseek failed: %s",strerror(errno));
+ return TBT_ERROR;
+ }
+
+ if ( offset % TBTREEPAGESIZE != 0 ) {
+ tlog(TL_CRIT,"TBTNewPage: file corrupted");
+ return TBT_ERROR;
+ }
+
+ db->lastpagenumber = offset/TBTREEPAGESIZE;
+ }
+
+ if( (*page = PageAlloc(db,&pos))==NULL )
+ return TBT_ERROR;
+ (*page)->pagenumber = db->lastpagenumber;
+ (*page)->issynced=0;
+ (*page)->islocked=1;
+
+ db->lastpagenumber++;
+
+ memset( &((*page)->page), 0, sizeof(TBTPAGEHDRSZ));
+ (*page)->page.freespace = TBTREEPAGESIZE-TBTPAGEHDRSZ;
+
+ gettimeofday( &( (*page)->last ), NULL );
+ if ( (*page)->iscached ) {
+ moveFirstTime(db, pos);
+ findAndMovePN(db, *page);
+ }
+
+ return TBT_OK;
+}
+
+#define NOTFOUND 0
+#define FOUND 1
+
+static void
+printValue(u_int32_t len, char *ptr) {
+ putchar('\'');
+ while(len>0) {
+ putchar(*ptr);
+ ptr++; len--;
+ }
+ putchar('\'');
+}
+
+static void
+dumpPage(TBTree *db, TBTPage *page, int follow) {
+ u_int32_t i,j;
+
+ for(j=0;j<follow;j++)
+ putchar(' ');
+ printf("Page: f:%d l:%d r:%d n:%d\n",
+ page->freespace,
+ page->isleaf,
+ page->rightlink,
+ page->npointer
+ );
+ for(i=0;i<page->npointer;i++) {
+ TBTPointer *ptr= (TBTPointer*)(page->data+TBTPOINTERSIZE(db)*i);
+ for(j=0;j<follow+2;j++)
+ putchar(' ');
+ printf("i:%d(%d) kl:%d ko:%d ",
+ i, (int)ptr,
+ (db->keylen) ? db->keylen : ptr->key.varlen.length,
+ (db->keylen) ? 0 : ptr->key.varlen.offset
+ );
+ if ( db->keylen==0 )
+ printValue(ptr->key.varlen.length, page->data + ptr->key.varlen.offset);
+ else
+ printf("'%d'", *(int*)(ptr->key.fixed.key));
+ if ( page->isleaf ) {
+ printf(" vl:%d vo:%d ",
+ ptr->pointer.leaf.length,
+ ptr->pointer.leaf.offset
+ );
+ printValue(ptr->pointer.leaf.length, page->data + ptr->pointer.leaf.offset );
+ putchar('\n');
+ } else {
+ printf(" page:%d\n", ptr->pointer.internal.link);
+ if ( follow>=0 )
+ dumpTree(db, ptr->pointer.internal.link, follow+4);
+ }
+ }
+}
+
+void
+dumpTree(TBTree *db, u_int32_t pagenumber, int follow) {
+ TBTMemPage *page;
+
+ if ( TBTReadPage(db, pagenumber, &page) ) {
+ fprintf(stderr,"BEDA %d\n", pagenumber);
+ return;
+ }
+
+ page->islocked=1;
+ dumpPage(db, &(page->page), follow);
+ page->islocked=0;
+ if ( !page->iscached )
+ tfree(page);
+}
+
+static int
+findInPage(TBTree *db, TBTPage *page, TBTValue *key, TBTPointer **res) {
+ TBTPointer *StopLow = (TBTPointer*)(page->data);
+ TBTPointer *StopHigh = (TBTPointer*)(page->data + TBTPOINTERSIZE(db)*page->npointer);
+ TBTPointer *StopMiddle;
+ int diff ;
+ TBTValue A;
+
+ /* Loop invariant: StopLow <= StopMiddle < StopHigh */
+ while (StopLow < StopHigh) {
+ StopMiddle = (TBTPointer*)(((char*)StopLow) + TBTPOINTERSIZE(db)*((((char*)StopHigh - (char*)StopLow)/TBTPOINTERSIZE(db)) >> 1));
+ if ( db->keylen == 0 ) {
+ A.length = StopMiddle->key.varlen.length;
+ A.value = page->data + StopMiddle->key.varlen.offset;
+ } else {
+ A.length = db->keylen;
+ A.value = StopMiddle->key.fixed.key;
+ }
+
+ if ( ISINFPOINTER(db, page, StopMiddle) )
+ diff = 1;
+ else
+ diff = db->cmpkey( &A, key );
+ if ( diff == 0 ) {
+ *res = StopMiddle;
+ return FOUND;
+ } else if ( diff < 0 )
+ StopLow = (TBTPointer*)((char*)StopMiddle + TBTPOINTERSIZE(db));
+ else
+ StopHigh = StopMiddle;
+ }
+
+ *res = StopHigh;
+ return NOTFOUND;
+}
+
+static int
+findPage(TBTree *db, TBTValue *key, TBTMemPage **page) {
+ u_int32_t pagenumber = TBTPAGEROOT;
+ int rc = TBT_OK;
+
+ *page=NULL;
+ while(1) {
+ TBTPointer *ptr;
+
+ if ( (rc=TBTReadPage(db, pagenumber, page))!=TBT_OK )
+ return rc;
+
+ if ( (*page)->page.isleaf )
+ return TBT_OK;
+
+ findInPage(db, &((*page)->page), key, &ptr );
+ pagenumber = ptr->pointer.internal.link;
+ if ( !(*page)->iscached )
+ tfree(*page);
+ }
+
+ return TBT_OK;
+}
+
+int
+TBTFind(TBTree *db, TBTValue *key, TBTValue *value) {
+ TBTMemPage *page;
+ TBTPointer *ptr;
+ int rc;
+
+ memset(value, 0, sizeof(TBTValue));
+
+ if ( (rc=findPage(db, key, &page))!=TBT_OK )
+ return rc;
+
+ if ( page == NULL )
+ return TBT_OK;
+
+ if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
+ if ( !(page)->iscached )
+ tfree(page);
+ return TBT_OK;
+ }
+
+ value->length = ptr->pointer.leaf.length;
+ value->value = tmalloc(value->length);
+ memcpy( value->value, page->page.data + ptr->pointer.leaf.offset, value->length );
+
+ if ( !(page)->iscached )
+ tfree(page);
+
+ return TBT_OK;
+}
+
+static void
+packLeafKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, TBTValue *value) {
+ if ( ptr ) {
+ if ( (char*)ptr - page->data != page->npointer * TBTPOINTERSIZE(db) )
+ memmove((char*)ptr+TBTPOINTERSIZE(db), ptr, page->npointer * TBTPOINTERSIZE(db) - ( (char*)ptr - page->data ));
+ } else
+ ptr = (TBTPointer*)(page->data + page->npointer * TBTPOINTERSIZE(db));
+
+ page->npointer++;
+ page->freespace -= TBTPOINTERSIZE(db) + PTRALIGN(value->length);
+ if ( db->keylen ) {
+ memcpy( ptr->key.fixed.key, key->value, db->keylen );
+ ptr->pointer.leaf.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
+ } else {
+ page->freespace -= PTRALIGN(key->length);
+ ptr->key.varlen.length = key->length;
+ ptr->key.varlen.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
+ memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
+ ptr->pointer.leaf.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace + PTRALIGN(key->length);
+ }
+ ptr->pointer.leaf.length = value->length;
+ memcpy( page->data + ptr->pointer.leaf.offset, value->value, value->length );
+}
+
+static void
+packInternalKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, u_int32_t pagenumber) {
+ if ( ptr ) {
+ if ( (char*)ptr - page->data != page->npointer * TBTPOINTERSIZE(db) )
+ memmove((char*)ptr+TBTPOINTERSIZE(db), ptr, page->npointer * TBTPOINTERSIZE(db) - ( (char*)ptr - page->data ));
+ } else
+ ptr = (TBTPointer*)(page->data + page->npointer * TBTPOINTERSIZE(db));
+
+ page->npointer++;
+ page->freespace -= TBTPOINTERSIZE(db);
+
+ if ( db->keylen ) {
+ if ( key->value )
+ memcpy( ptr->key.fixed.key, key->value, db->keylen );
+ else
+ memset( ptr->key.fixed.key, 0, db->keylen );
+ } else {
+ page->freespace -= PTRALIGN(key->length);
+ ptr->key.varlen.length = key->length;
+ ptr->key.varlen.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
+ if ( key->length )
+ memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
+ }
+ ptr->pointer.internal.link = pagenumber;
+}
+
+
+static void
+deleteKV(TBTree *db, TBTPage *page, TBTPointer **ptrs, u_int32_t num) {
+ static char buf[TBTREEPAGESIZE];
+ char *bufptr = buf;
+ TBTPointer *ptr = (TBTPointer*)page->data, *tmp, **ptrptr = ptrs;
+ u_int32_t i, start;
+ /* suppose ptrs are ordered! */
+ tmp = ptr;
+ while( ptrptr - ptrs < num && (char*)ptr - page->data < TBTPOINTERSIZE(db)*page->npointer ) {
+ if ( *ptrptr < ptr ) {
+ tlog(TL_CRIT,"deleteKV: Something wrong: unexistent *ptrs");
+ ptrptr++;
+ } else if ( *ptrptr > ptr ) {
+ if ( tmp != ptr )
+ memcpy(tmp, ptr, TBTPOINTERSIZE(db));
+ ptr = (TBTPointer*)( (char*)ptr + TBTPOINTERSIZE(db) );
+ tmp = (TBTPointer*)( (char*)tmp + TBTPOINTERSIZE(db) );
+ } else {
+ page->freespace += TBTPOINTERSIZE(db) +
+ ( ( page->isleaf ) ? PTRALIGN(ptr->pointer.leaf.length) : 0 ) +
+ ( ( db->keylen ) ? 0 : PTRALIGN(ptr->key.varlen.length) );
+ ptrptr++;
+ ptr = (TBTPointer*)( (char*)ptr + TBTPOINTERSIZE(db) );
+ }
+ }
+
+ if ( (char*)ptr - page->data < TBTPOINTERSIZE(db)*page->npointer && tmp!=ptr )
+ memmove( tmp, ptr, page->npointer * TBTPOINTERSIZE(db) - ((char*)ptr - page->data) );
+
+ page->npointer-=num;
+
+ tmp = (TBTPointer*)page->data;
+ start = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
+ for(i=0;i<page->npointer;i++) {
+ if ( page->isleaf ) {
+ memcpy( bufptr, page->data + tmp->pointer.leaf.offset, tmp->pointer.leaf.length );
+ tmp->pointer.leaf.offset = start + (bufptr-buf);
+ bufptr+=PTRALIGN(tmp->pointer.leaf.length);
+ }
+
+ if ( db->keylen == 0 ) {
+ if ( tmp->key.varlen.length )
+ memcpy( bufptr, page->data + tmp->key.varlen.offset, tmp->key.varlen.length );
+ tmp->key.varlen.offset = start + (bufptr-buf);
+ bufptr+=PTRALIGN(tmp->key.varlen.length);
+ }
+ tmp = (TBTPointer*)( (char*)tmp + TBTPOINTERSIZE(db) );
+ }
+
+ memcpy( page->data + start, buf, bufptr-buf );
+}
+
+static u_int32_t
+findDelimiter(TBTree *db, TBTPage *page, TBTPointer *ptr, int size, u_int32_t start, int *where) {
+ int *sizes = (int*) tmalloc( sizeof(int) * (page->npointer+1) );
+ int i;
+ TBTPointer *tomove;
+ int lfree=TBTREEPAGESIZE-TBTPAGEHDRSZ, rfree=TBTREEPAGESIZE-TBTPAGEHDRSZ;
+ int nptr = ( (char*)ptr - page->data ) / TBTPOINTERSIZE(db);
+ int was=0;
+
+ for(i=0; i<page->npointer;i++) {
+ tomove = (TBTPointer*)(page->data + i*TBTPOINTERSIZE(db));
+ if ( was==0 && i==nptr ) {
+ sizes[i] = size;
+ was=1;
+ }
+ sizes[i+was] = TBTPOINTERSIZE(db) +
+ ( (page->isleaf) ? PTRALIGN(tomove->pointer.leaf.length) : 0 ) +
+ ( ( db->keylen ) ? 0 : PTRALIGN(tomove->key.varlen.length) );
+
+ }
+
+ if ( was==0 )
+ sizes[page->npointer]=size;
+
+ for(i=0;i<page->npointer+1;i++) {
+ if ( i< start )
+ lfree-=sizes[i];
+ else
+ rfree-=sizes[i];
+ }
+
+ while( 1 ) {
+ if ( lfree<0 ) {
+ lfree+=sizes[start];
+ rfree-=sizes[start];
+ start--;
+ } else if ( rfree < 0 ) {
+ lfree-=sizes[start];
+ rfree+=sizes[start];
+ start++;
+ } else
+ break;
+ }
+
+ *where = (nptr < start) ? -1 : 1;
+ if ( start>nptr )
+ start--;
+
+ tfree(sizes);
+ return start;
+}
+
+static void
+populatePages(TBTree *db, TBTPage *srcpage, TBTPage* newpage, u_int32_t start) {
+ u_int32_t i;
+ TBTPointer *tomove, **todelete = tmalloc( sizeof(TBTPointer*)* srcpage->npointer );
+ u_int32_t numtodelete=0;
+ TBTValue k, v;
+
+ for(i=start;i<srcpage->npointer;i++) {
+ tomove = (TBTPointer*)(srcpage->data + i*TBTPOINTERSIZE(db));
+
+ todelete[numtodelete] = tomove;
+ numtodelete++;
+
+ if ( db->keylen )
+ k.value = tomove->key.fixed.key;
+ else {
+ k.length = tomove->key.varlen.length;
+ k.value = srcpage->data + tomove->key.varlen.offset;
+ }
+
+ if ( srcpage->isleaf ) {
+ v.length = tomove->pointer.leaf.length;
+ v.value = srcpage->data + tomove->pointer.leaf.offset;
+ packLeafKV(db, newpage, NULL, &k, &v);
+ } else
+ packInternalKV(db, newpage, NULL, &k, tomove->pointer.internal.link);
+ }
+
+ deleteKV(db, srcpage, todelete, numtodelete);
+ tfree(todelete);
+}
+
+static int
+splitPage(TBTree *db, TBTMemPage *srcpage, TBTMemPage** newpage, TBTPointer **ptr, int size, TBTValue *key, TBTValue *value, u_int32_t pagenumber) {
+ TBTMemPage *tmp;
+ int start=0, where=0;
+ int nptr = ( (char*)(*ptr) - srcpage->page.data ) / TBTPOINTERSIZE(db);
+
+ if ( TBTNewPage(db, newpage) )
+ return TBT_ERROR;
+
+ tmp = *newpage;
+ srcpage->issynced=tmp->issynced=0;
+ tmp->islocked=1;
+ tmp->page.rightlink = srcpage->page.rightlink;
+ srcpage->page.rightlink = tmp->pagenumber;
+ tmp->page.isleaf = srcpage->page.isleaf;
+
+ switch(db->strategy) {
+ case TBT_STATEGY_HALF:
+ start = findDelimiter(db, &(srcpage->page), *ptr, size, (srcpage->page.npointer+1)/2, &where);
+ break;
+ case TBT_STATEGY_LEFTFILL:
+ start = findDelimiter(db, &(srcpage->page), *ptr, size, srcpage->page.npointer, &where);
+ break;
+ case TBT_STATEGY_RIGHTFILL:
+ start = findDelimiter(db, &(srcpage->page), *ptr, size, 0, &where);
+ break;
+ default:
+ tlog(TL_CRIT,"splitPage: unknown strategy: %d", db->strategy);
+ return TBT_ERROR;
+ }
+
+ populatePages(db, &(srcpage->page), &(tmp->page), start);
+ if ( where<0 )
+ tmp=srcpage;
+ else
+ nptr-=start;
+ *ptr = (TBTPointer*)(tmp->page.data + nptr*TBTPOINTERSIZE(db));
+
+ if ( tmp->page.isleaf )
+ packLeafKV(db, &(tmp->page), *ptr, key, value);
+ else
+ packInternalKV(db, &(tmp->page), *ptr, key, pagenumber);
+
+ return TBT_OK;
+}
+
+static int
+savePage(TBTree *db, TBTMemPage *page) {
+ int rc;
+ page->islocked=0;
+
+ if ( !page->iscached ) {
+ if ( page->issynced==0 && (rc=TBTWritePage(db, page)) != TBT_OK )
+ return rc;
+ tfree(page);
+ }
+ return TBT_OK;
+}
+
+static int
+layerInsert(TBTree *db, u_int32_t pagenumber, TBTMemPage **left, TBTMemPage **right, TBTValue *key, TBTValue *value) {
+ int rc,fnd;
+ TBTPointer *ptr=NULL;
+ TBTMemPage *page=NULL, *newright=NULL;
+
+ if ( (rc=TBTReadPage(db, pagenumber, &page))!=TBT_OK )
+ return rc;
+
+ page->islocked=1;
+
+ fnd = findInPage(db, &(page->page), key, &ptr );
+
+ *left=*right=NULL;
+ if ( page->page.isleaf ) {
+ u_int32_t size = TBTPOINTERSIZE(db) +
+ PTRALIGN(value->length) +
+ ( ( db->keylen ) ? 0 : PTRALIGN(key->length) );
+
+ if ( size>(TBTREEPAGESIZE-TBTPAGEHDRSZ)/2 ) {
+ tlog(TL_ALARM,"Huge size: %d bytes", size);
+ return TBT_HUGE;
+ }
+
+ if (fnd==FOUND) {
+ deleteKV(db, &(page->page), &ptr, 1);
+ page->issynced=0;
+ }
+
+ if ( size <= page->page.freespace ) {
+ packLeafKV(db, &(page->page), ptr, key, value);
+ page->issynced=0;
+ } else {
+ rc = splitPage(db, page, &newright, &ptr, size, key, value, 0);
+ if ( rc != TBT_OK ) return rc;
+
+ page->issynced = newright->issynced = 0;
+ page->islocked = newright->islocked = 1;
+ }
+ } else {
+ if ( (rc=layerInsert(db, ptr->pointer.internal.link, left, right, key, value))!=TBT_OK ) {
+ page->islocked=0;
+ return rc;
+ }
+
+ if ( *left ) { /* child page was splited */
+ u_int32_t size;
+ TBTPointer *rightptr = (TBTPointer*)( (*left)->page.data + ((*left)->page.npointer-1) * TBTPOINTERSIZE(db) );
+ TBTValue K;
+
+ if ( ISINFPOINTER(db, &(page->page), ptr) ) {
+ deleteKV(db, &(page->page), &ptr, 1);
+ K.length = 0; K.value = NULL;
+ packInternalKV(db, &(page->page), ptr, &K, (*right)->pagenumber);
+ } else
+ ptr->pointer.internal.link=(*right)->pagenumber;
+
+ K.length = ( db->keylen ) ? db->keylen : rightptr->key.varlen.length;
+ K.value = ( db->keylen ) ? rightptr->key.fixed.key : (*left)->page.data + rightptr->key.varlen.offset;
+
+ size = TBTPOINTERSIZE(db) + ( ( db->keylen ) ? 0 : PTRALIGN(K.length) );
+
+ if ( size <= page->page.freespace ) {
+ packInternalKV(db, &(page->page), ptr, &K, (*left)->pagenumber);
+ page->issynced=0;
+ } else {
+ rc = splitPage(db, page, &newright, &ptr, size, &K, NULL, (*left)->pagenumber);
+ if ( rc != TBT_OK ) return rc;
+
+ page->issynced = newright->issynced = 0;
+ page->islocked = newright->islocked = 1;
+ }
+ }
+ }
+
+ if ( *left ) {
+ if ( (rc=savePage(db, *left))!=TBT_OK )
+ return rc;
+ if ( (rc=savePage(db, *right))!=TBT_OK )
+ return rc;
+ *left=*right=NULL;
+ }
+
+ if ( newright ) {
+ if ( page->pagenumber==TBTPAGEROOT ) { /* root was splited */
+ TBTMemPage *newleft;
+ TBTValue K;
+
+
+ if ( (rc=TBTNewPage(db,&newleft))!=TBT_OK )
+ return rc;
+ memcpy( &(newleft->page) , &(page->page), TBTREEPAGESIZE);
+ memset( &(page->page), 0, TBTREEPAGESIZE);
+ page->page.freespace=TBTREEPAGESIZE-TBTPAGEHDRSZ;
+
+ ptr = (TBTPointer*)( newleft->page.data + (newleft->page.npointer-1) * TBTPOINTERSIZE(db) );
+ K.length = ( db->keylen ) ? db->keylen : ptr->key.varlen.length;
+ K.value = ( db->keylen ) ? ptr->key.fixed.key : newleft->page.data + ptr->key.varlen.offset;
+ packInternalKV(db, &(page->page), NULL, &K, newleft->pagenumber);
+
+ ptr = (TBTPointer*)( newright->page.data + (newright->page.npointer-1) * TBTPOINTERSIZE(db) );
+ K.length = 0;
+ K.value = NULL;
+ packInternalKV(db, &(page->page), NULL, &K, newright->pagenumber);
+
+ if ( (rc=savePage(db, newright))!=TBT_OK )
+ return rc;
+ if ( (rc=savePage(db, newleft))!=TBT_OK )
+ return rc;
+ if ( (rc=savePage(db, page))!=TBT_OK )
+ return rc;
+ } else {
+ *left = page;
+ *right = newright;
+ }
+ } else if ( (rc=savePage(db, page))!=TBT_OK )
+ return rc;
+
+
+ return TBT_OK;
+}
+
+int
+TBTInsert(TBTree *db, TBTValue *key, TBTValue *value) {
+ int rc;
+ TBTMemPage *lpage=NULL, *rpage=NULL;
+
+ rc = layerInsert(db, TBTPAGEROOT, &lpage, &rpage, key, value);
+
+ tassert( lpage==NULL && rpage==NULL );
+
+ return rc;
+}
+
+int
+TBTDelete(TBTree *db, TBTValue *key) {
+ TBTMemPage *page;
+ TBTPointer *ptr;
+ int rc;
+
+ if ( (rc=findPage(db, key, &page))!=TBT_OK )
+ return rc;
+
+ if ( page == NULL )
+ return TBT_OK;
+
+ if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
+ if ( !(page)->iscached )
+ tfree(page);
+ return TBT_OK;
+ }
+
+ deleteKV(db, &(page->page), &ptr, 1);
+ page->issynced=0;
+
+ rc=savePage(db, page);
+
+ return rc;
+}
+
+int
+TBTInitIterator(TBTree *db, TBTIterator *iterator) {
+ TBTMemPage *page=NULL;
+ TBTPointer *ptr;
+ int rc;
+ u_int32_t pagenum=TBTPAGEROOT;
+
+ memset(iterator, 0, sizeof(TBTIterator));
+
+ do {
+ if ( page && !page->iscached )
+ tfree(page);
+ if ( (rc=TBTReadPage(db, pagenum, &page))!=TBT_OK )
+ return rc;
+
+ ptr = (TBTPointer*)( page->page.data );
+ pagenum = ptr->pointer.internal.link;
+ } while( !page->page.isleaf );
+
+ page->islocked=1;
+ iterator->page = page;
+ iterator->ptr = ptr;
+
+ return TBT_OK;
+}
+
+void
+TBTFreeIterator(TBTree *db, TBTIterator *iterator) {
+
+ if ( iterator->page ) {
+ iterator->page->islocked=0;
+ if ( !iterator->page->iscached )
+ tfree(iterator->page);
+ }
+
+ memset(iterator, 0, sizeof(TBTIterator));
+}
+
+int
+TBTIterate(TBTree *db, TBTIterator *iterator, TBTValue *key, TBTValue *value ) {
+ int rc;
+
+ if ( iterator->page==NULL ) {
+ memset(key, 0, sizeof(TBTValue));
+ memset(value, 0, sizeof(TBTValue));
+ return TBT_OK;
+ }
+
+ if ( (char*)(iterator->ptr) - iterator->page->page.data >= TBTPOINTERSIZE(db) * iterator->page->page.npointer ) {
+ do {
+ u_int32_t pagenumber = iterator->page->page.rightlink;
+ iterator->page->islocked=0;
+ if ( !iterator->page->iscached )
+ tfree(iterator->page);
+ iterator->page=NULL;
+
+ if ( pagenumber==TBTPAGEROOT ) {
+ memset(key, 0, sizeof(TBTValue));
+ memset(value, 0, sizeof(TBTValue));
+ return TBT_OK;
+ }
+ if ( (rc=TBTReadPage(db, pagenumber, &(iterator->page)))!=TBT_OK )
+ return rc;
+ } while ( iterator->page->page.npointer == 0 );
+
+ iterator->page->islocked=1;
+ iterator->ptr = (TBTPointer*)( iterator->page->page.data );
+ }
+
+ if ( db->keylen ) {
+ key->length = db->keylen;
+ key->value = iterator->ptr->key.fixed.key;
+ } else {
+ key->length = iterator->ptr->key.varlen.length;
+ key->value = iterator->page->page.data + iterator->ptr->key.varlen.offset;
+ }
+
+ value->length = iterator->ptr->pointer.leaf.length;
+ value->value = iterator->page->page.data + iterator->ptr->pointer.leaf.offset;
+
+ iterator->ptr = ( TBTPointer* ) ( (char*)(iterator->ptr) + TBTPOINTERSIZE(db) );
+
+ return TBT_OK;
+}
+
+