2 * Copyright (c) 2005 Teodor Sigaev <teodor@sigaev.ru>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the author nor the names of any co-contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
25 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
27 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 #include <sys/types.h>
47 static int TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) ;
48 static int TBTWritePage(TBTree *db, TBTMemPage *page) ;
49 static int TBTNewPage(TBTree *db, TBTMemPage **page) ;
54 TBTOpen(TBTree *db, char *file) {
58 db->fd = open(file, O_RDONLY);
60 tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
63 if ( flock( db->fd, LOCK_SH ) < 0 ) {
64 tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
69 db->fd = open(file, O_CREAT | O_RDWR, 0666);
71 tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
74 if ( flock( db->fd, LOCK_EX ) < 0 ) {
75 tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
82 db->Cache=(TBTMemPage**)t0malloc( sizeof(TBTMemPage*)*HASHSIZE(db->npage) );
86 db->cachehits = db->pageread = db->pagewrite = 0;
88 if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
89 tlog(TL_CRIT,"TBTOpen: lseek failed: %s",strerror(errno));
90 flock( db->fd, LOCK_UN );
97 tlog(TL_CRIT,"TBTOpen: file is void");
98 flock( db->fd, LOCK_UN );
103 if ( TBTNewPage(db, &page) != TBT_OK ) {
104 flock( db->fd, LOCK_UN );
109 if ( TBTWritePage(db, page)!=TBT_OK ){
110 flock( db->fd, LOCK_UN );
117 db->pointersize = TBTPOINTERSIZE(db);
118 db->lastpagenumber=0;
123 TBTClose(TBTree *db) {
131 TBTMemPage *ptr,*tmp;
133 for(i=0; i<HASHSIZE(db->npage); i++) {
145 flock( db->fd, LOCK_UN );
148 return (rc) ? TBT_ERROR : TBT_OK;
152 cmpNPage(const void *a, const void *b) {
153 tassert( (*(TBTMemPage**)a)->pagenumber != (*(TBTMemPage**)b)->pagenumber );
154 return ( (*(TBTMemPage**)a)->pagenumber > (*(TBTMemPage**)b)->pagenumber ) ? 1 : -1;
159 TBTSync(TBTree *db) {
162 if ( db->readonly || db->npage==0 )
166 u_int32_t i, total=0;
167 TBTMemPage *ptr, **data = (TBTMemPage**)tmalloc(db->npage*sizeof(TBTMemPage*)), **pptr;
169 for(i=0; i<HASHSIZE(db->npage); i++) {
172 if ( !ptr->issynced ) {
183 qsort( data, total, sizeof(TBTMemPage*), cmpNPage);
186 while( pptr-data<total) {
187 rc |= TBTWritePage(db, *pptr);
194 if ( fsync(db->fd) ) {
195 tlog(TL_CRIT,"TBTSync: fsync failed: %s",strerror(errno));
199 return (rc) ? TBT_ERROR : TBT_OK;
203 findInCache(TBTree *db, u_int32_t pagenumber) {
206 ptr = db->Cache[ pagenumber % HASHSIZE(db->npage) ];
208 while(ptr && ptr->pagenumber != pagenumber)
215 removeFromCache(TBTree *db, u_int32_t pagenumber) {
216 TBTMemPage *ptr, *prev=NULL;
217 int pos = pagenumber % HASHSIZE(db->npage);
219 ptr = db->Cache[ pos ];
222 if ( ptr->pagenumber == pagenumber ) {
224 prev->link = ptr->link;
226 db->Cache[ pos ] = ptr->link;
235 insertInCache(TBTree *db, TBTMemPage *page) {
236 int pos = page->pagenumber % HASHSIZE(db->npage);
238 page->link = db->Cache[ pos ];
239 db->Cache[ pos ] = page;
243 PageAlloc(TBTree *db) {
246 if ( db->curpage < db->npage ) {
247 page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage));
248 memset( page, 0, TBTMEMPAGEHDRSZ );
250 if ( db->curpage == 0 ) {
251 db->TimeCache = db->TimeCacheLast = page;
253 db->TimeCacheLast->next = page;
254 page->prev = db->TimeCacheLast;
255 db->TimeCacheLast = page;
258 } else if ( db->npage == 0 ) {
259 page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage));
260 memset( page, 0, TBTMEMPAGEHDRSZ );
262 page = db->TimeCache;
263 while( page->next && page->islocked )
265 if ( page==db->TimeCacheLast && page->islocked ) { /*all pages locked*/
266 /* tlog(TL_WARN,"TBTReadPage: all pages in cache locked, increase number of cached pages"); */
267 page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage));
268 memset( page, 0, TBTMEMPAGEHDRSZ );
270 if ( !page->issynced )
271 if ( TBTWritePage(db, page) )
273 removeFromCache(db, page->pagenumber);
274 if ( page != db->TimeCacheLast ) {
275 if ( page == db->TimeCache ) {
276 db->TimeCache = page->next;
277 db->TimeCache->prev = NULL;
279 page->prev->next = page->next;
280 page->next->prev = page->prev;
282 memset( page, 0, TBTMEMPAGEHDRSZ );
283 db->TimeCacheLast->next = page;
284 page->prev = db->TimeCacheLast;
285 db->TimeCacheLast = page;
287 memset( page, 0, TBTMEMPAGEHDRSZ );
296 TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) {
299 if ( db->npage && (*page=findInCache(db, pagenumber))!=NULL ) {
301 if ( *page != db->TimeCacheLast ) {
302 if ( (*page) == db->TimeCache ) {
303 db->TimeCache = (*page)->next;
304 db->TimeCache->prev=NULL;
306 (*page)->prev->next = (*page)->next;
307 (*page)->next->prev = (*page)->prev;
309 db->TimeCacheLast->next = (*page);
310 (*page)->prev = db->TimeCacheLast;
311 db->TimeCacheLast = *page;
312 (*page)->next = NULL;
315 off_t offset = (off_t)pagenumber * (off_t)TBTREEPAGESIZE;
317 if( (*page = PageAlloc(db))==NULL )
320 if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
321 tlog(TL_CRIT,"TBTReadPage: lseek failed: %s",strerror(errno));
325 (*page)->pagenumber=pagenumber;
326 if ( (rc=read(db->fd, &( (*page)->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
327 tlog(TL_CRIT,"TBTReadPage: read failed: %s %d %d",strerror(errno), pagenumber, offset);
330 if ( (*page)->iscached )
331 insertInCache(db, *page);
339 TBTWritePage(TBTree *db, TBTMemPage *page) {
340 off_t offset = (off_t)(page->pagenumber) * (off_t)TBTREEPAGESIZE;
343 if ( page->issynced )
346 if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
347 tlog(TL_CRIT,"TBTWritePage: lseek failed: %s",strerror(errno));
351 if ( (size=write(db->fd, &( page->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
352 tlog(TL_CRIT,"TBTWritePage: write failed: %s (writed only %d bytes)",strerror(errno), size);
363 TBTNewPage(TBTree *db, TBTMemPage **page) {
364 if ( db->lastpagenumber==0 ) {
366 if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
367 tlog(TL_CRIT,"TBTNewPage: lseek failed: %s",strerror(errno));
371 if ( offset % TBTREEPAGESIZE != 0 ) {
372 tlog(TL_CRIT,"TBTNewPage: file corrupted");
376 db->lastpagenumber = offset/TBTREEPAGESIZE;
379 if( (*page = PageAlloc(db))==NULL )
382 (*page)->pagenumber = db->lastpagenumber;
386 db->lastpagenumber++;
388 memset( &((*page)->page), 0, TBTMEMPAGEHDRSZ);
389 (*page)->page.freespace = TBTREEPAGESIZE-TBTPAGEHDRSZ;
391 if ( (*page)->iscached )
392 insertInCache(db, *page);
398 printValue(u_int32_t len, char *ptr) {
408 dumpPage(TBTree *db, TBTPage *page, int follow) {
411 for(j=0;j<follow;j++)
413 printf("Page: f:%d l:%d r:%d n:%d\n",
419 for(i=0;i<page->npointer;i++) {
420 TBTPointer *ptr= (TBTPointer*)(page->data+db->pointersize*i);
421 for(j=0;j<follow+2;j++)
423 printf("i:%d(%d) kl:%d ko:%d ",
425 (db->keylen) ? db->keylen : ptr->key.varlen.length,
426 (db->keylen) ? 0 : ptr->key.varlen.offset
429 printValue(ptr->key.varlen.length, page->data + ptr->key.varlen.offset);
431 printf("'%d'", *(int*)(ptr->key.fixed.key));
432 if ( page->isleaf ) {
433 printf(" vl:%d vo:%d ",
434 ptr->pointer.leaf.length,
435 ptr->pointer.leaf.offset
437 printValue(ptr->pointer.leaf.length, page->data + ptr->pointer.leaf.offset );
440 printf(" page:%d\n", ptr->pointer.internal.link);
442 dumpTree(db, ptr->pointer.internal.link, follow+4);
448 dumpTree(TBTree *db, u_int32_t pagenumber, int follow) {
451 if ( TBTReadPage(db, pagenumber, &page) ) {
452 tlog(TL_CRIT,"BEDA %d\n", pagenumber);
457 dumpPage(db, &(page->page), follow);
459 if ( !page->iscached )
464 findInPage(TBTree *db, TBTPage *page, TBTValue *key, TBTPointer **res) {
465 TBTPointer *StopLow = (TBTPointer*)(page->data);
466 TBTPointer *StopHigh = (TBTPointer*)(page->data + db->pointersize*page->npointer);
467 TBTPointer *StopMiddle;
471 A.length = db->keylen;
472 /* Loop invariant: StopLow <= StopMiddle < StopHigh */
473 while (StopLow < StopHigh) {
474 StopMiddle = (TBTPointer*)(((char*)StopLow) + db->pointersize*((((char*)StopHigh - (char*)StopLow)/db->pointersize) >> 1));
475 if ( db->keylen == 0 ) {
476 A.length = StopMiddle->key.varlen.length;
477 A.value = page->data + StopMiddle->key.varlen.offset;
479 A.value = StopMiddle->key.fixed.key;
482 if ( ISINFPOINTER(db, page, StopMiddle) )
485 diff = db->cmpkey( &A, key );
489 } else if ( diff < 0 )
490 StopLow = (TBTPointer*)((char*)StopMiddle + db->pointersize);
492 StopHigh = StopMiddle;
500 findPage(TBTree *db, TBTValue *key, TBTMemPage **page) {
501 u_int32_t pagenumber = TBTPAGEROOT;
508 if ( (rc=TBTReadPage(db, pagenumber, page))!=TBT_OK )
511 if ( (*page)->page.isleaf )
514 findInPage(db, &((*page)->page), key, &ptr );
515 pagenumber = ptr->pointer.internal.link;
516 if ( !(*page)->iscached )
524 TBTFind(TBTree *db, TBTValue *key, TBTValue *value) {
529 memset(value, 0, sizeof(TBTValue));
531 if ( (rc=findPage(db, key, &page))!=TBT_OK )
537 if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
538 if ( !(page)->iscached )
543 value->length = ptr->pointer.leaf.length;
544 value->value = tmalloc(value->length);
545 memcpy( value->value, page->page.data + ptr->pointer.leaf.offset, value->length );
547 if ( !(page)->iscached )
554 packLeafKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, TBTValue *value) {
556 if ( (char*)ptr - page->data != page->npointer * db->pointersize )
557 memmove((char*)ptr+db->pointersize, ptr, page->npointer * db->pointersize - ( (char*)ptr - page->data ));
559 ptr = (TBTPointer*)(page->data + page->npointer * db->pointersize);
562 page->freespace -= db->pointersize + PTRALIGN(value->length);
564 memcpy( ptr->key.fixed.key, key->value, db->keylen );
565 ptr->pointer.leaf.offset = page->npointer * db->pointersize + page->freespace;
567 page->freespace -= PTRALIGN(key->length);
568 ptr->key.varlen.length = key->length;
569 ptr->key.varlen.offset = page->npointer * db->pointersize + page->freespace;
570 memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
571 ptr->pointer.leaf.offset = page->npointer * db->pointersize + page->freespace + PTRALIGN(key->length);
573 ptr->pointer.leaf.length = value->length;
574 memcpy( page->data + ptr->pointer.leaf.offset, value->value, value->length );
578 packInternalKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, u_int32_t pagenumber) {
580 if ( (char*)ptr - page->data != page->npointer * db->pointersize )
581 memmove((char*)ptr+db->pointersize, ptr, page->npointer * db->pointersize - ( (char*)ptr - page->data ));
583 ptr = (TBTPointer*)(page->data + page->npointer * db->pointersize);
586 page->freespace -= db->pointersize;
590 memcpy( ptr->key.fixed.key, key->value, db->keylen );
592 memset( ptr->key.fixed.key, 0, db->keylen );
594 page->freespace -= PTRALIGN(key->length);
595 ptr->key.varlen.length = key->length;
596 ptr->key.varlen.offset = page->npointer * db->pointersize + page->freespace;
598 memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
600 ptr->pointer.internal.link = pagenumber;
605 deleteKV(TBTree *db, TBTPage *page, TBTPointer **ptrs, u_int32_t num) {
606 static char buf[TBTREEPAGESIZE];
608 TBTPointer *ptr = (TBTPointer*)page->data, *tmp, **ptrptr = ptrs;
610 /* suppose ptrs are ordered! */
612 while( ptrptr - ptrs < num && (char*)ptr - page->data < db->pointersize*page->npointer ) {
613 if ( *ptrptr < ptr ) {
614 tlog(TL_CRIT,"deleteKV: Something wrong: unexistent *ptrs");
616 } else if ( *ptrptr > ptr ) {
618 memcpy(tmp, ptr, db->pointersize);
619 ptr = (TBTPointer*)( (char*)ptr + db->pointersize );
620 tmp = (TBTPointer*)( (char*)tmp + db->pointersize );
622 page->freespace += db->pointersize +
623 ( ( page->isleaf ) ? PTRALIGN(ptr->pointer.leaf.length) : 0 ) +
624 ( ( db->keylen ) ? 0 : PTRALIGN(ptr->key.varlen.length) );
626 ptr = (TBTPointer*)( (char*)ptr + db->pointersize );
630 if ( (char*)ptr - page->data < db->pointersize*page->npointer && tmp!=ptr )
631 memmove( tmp, ptr, page->npointer * db->pointersize - ((char*)ptr - page->data) );
635 tmp = (TBTPointer*)page->data;
636 start = page->npointer * db->pointersize + page->freespace;
637 for(i=0;i<page->npointer;i++) {
638 if ( page->isleaf ) {
639 memcpy( bufptr, page->data + tmp->pointer.leaf.offset, tmp->pointer.leaf.length );
640 tmp->pointer.leaf.offset = start + (bufptr-buf);
641 bufptr+=PTRALIGN(tmp->pointer.leaf.length);
644 if ( db->keylen == 0 ) {
645 if ( tmp->key.varlen.length )
646 memcpy( bufptr, page->data + tmp->key.varlen.offset, tmp->key.varlen.length );
647 tmp->key.varlen.offset = start + (bufptr-buf);
648 bufptr+=PTRALIGN(tmp->key.varlen.length);
650 tmp = (TBTPointer*)( (char*)tmp + db->pointersize );
653 memcpy( page->data + start, buf, bufptr-buf );
657 findDelimiter(TBTree *db, TBTPage *page, TBTPointer *ptr, int size, u_int32_t start, int *where) {
658 int *sizes = (int*) tmalloc( sizeof(int) * (page->npointer+1) );
661 int lfree=TBTREEPAGESIZE-TBTPAGEHDRSZ, rfree=TBTREEPAGESIZE-TBTPAGEHDRSZ;
662 int nptr = ( (char*)ptr - page->data ) / db->pointersize;
665 for(i=0; i<page->npointer;i++) {
666 tomove = (TBTPointer*)(page->data + i*db->pointersize);
667 if ( was==0 && i==nptr ) {
671 sizes[i+was] = db->pointersize +
672 ( (page->isleaf) ? PTRALIGN(tomove->pointer.leaf.length) : 0 ) +
673 ( ( db->keylen ) ? 0 : PTRALIGN(tomove->key.varlen.length) );
678 sizes[page->npointer]=size;
681 for(i=0;i<page->npointer+1;i++)
692 } else if ( rfree < 0 ) {
700 *where = (nptr < start) ? -1 : 1;
709 populatePages(TBTree *db, TBTPage *srcpage, TBTPage* newpage, u_int32_t start) {
711 TBTPointer *tomove, **todelete = tmalloc( sizeof(TBTPointer*)* srcpage->npointer );
712 u_int32_t numtodelete=0;
715 for(i=start;i<srcpage->npointer;i++) {
716 tomove = (TBTPointer*)(srcpage->data + i*db->pointersize);
718 todelete[numtodelete] = tomove;
722 k.value = tomove->key.fixed.key;
724 k.length = tomove->key.varlen.length;
725 k.value = srcpage->data + tomove->key.varlen.offset;
728 if ( srcpage->isleaf ) {
729 v.length = tomove->pointer.leaf.length;
730 v.value = srcpage->data + tomove->pointer.leaf.offset;
731 packLeafKV(db, newpage, NULL, &k, &v);
733 packInternalKV(db, newpage, NULL, &k, tomove->pointer.internal.link);
736 deleteKV(db, srcpage, todelete, numtodelete);
741 splitPage(TBTree *db, TBTMemPage *srcpage, TBTMemPage** newpage, TBTPointer **ptr, int size, TBTValue *key, TBTValue *value, u_int32_t pagenumber) {
743 int start=0, where=0;
744 int nptr = ( (char*)(*ptr) - srcpage->page.data ) / db->pointersize;
746 if ( TBTNewPage(db, newpage) )
750 srcpage->issynced=tmp->issynced=0;
752 tmp->page.rightlink = srcpage->page.rightlink;
753 srcpage->page.rightlink = tmp->pagenumber;
754 tmp->page.isleaf = srcpage->page.isleaf;
756 switch(db->strategy) {
757 case TBT_STATEGY_HALF:
758 start = findDelimiter(db, &(srcpage->page), *ptr, size, (srcpage->page.npointer+1)/2, &where);
760 case TBT_STATEGY_LEFTFILL:
761 start = findDelimiter(db, &(srcpage->page), *ptr, size, srcpage->page.npointer, &where);
763 case TBT_STATEGY_RIGHTFILL:
764 start = findDelimiter(db, &(srcpage->page), *ptr, size, 0, &where);
767 tlog(TL_CRIT,"splitPage: unknown strategy: %d", db->strategy);
771 populatePages(db, &(srcpage->page), &(tmp->page), start);
776 *ptr = (TBTPointer*)(tmp->page.data + nptr*db->pointersize);
778 if ( tmp->page.isleaf )
779 packLeafKV(db, &(tmp->page), *ptr, key, value);
781 packInternalKV(db, &(tmp->page), *ptr, key, pagenumber);
786 savePage(TBTree *db, TBTMemPage *page) {
790 if ( !page->iscached ) {
791 if ( page->issynced==0 && (rc=TBTWritePage(db, page)) != TBT_OK )
799 layerInsert(TBTree *db, u_int32_t pagenumber, TBTMemPage **left, TBTMemPage **right, TBTValue *key, TBTValue *value) {
801 TBTPointer *ptr=NULL;
802 TBTMemPage *page=NULL, *newright=NULL;
804 if ( (rc=TBTReadPage(db, pagenumber, &page))!=TBT_OK )
809 fnd = findInPage(db, &(page->page), key, &ptr );
812 if ( page->page.isleaf ) {
813 u_int32_t size = db->pointersize +
814 PTRALIGN(value->length) +
815 ( ( db->keylen ) ? 0 : PTRALIGN(key->length) );
817 if ( size>(TBTREEPAGESIZE-TBTPAGEHDRSZ)/2 ) {
818 tlog(TL_ALARM,"Huge size: %d bytes", size);
823 deleteKV(db, &(page->page), &ptr, 1);
827 if ( size <= page->page.freespace ) {
828 u_int32_t oldsize=page->page.freespace;
829 packLeafKV(db, &(page->page), ptr, key, value);
830 tassert( oldsize == page->page.freespace + size );
833 rc = splitPage(db, page, &newright, &ptr, size, key, value, 0);
834 if ( rc != TBT_OK ) return rc;
836 page->issynced = newright->issynced = 0;
837 page->islocked = newright->islocked = 1;
840 if ( (rc=layerInsert(db, ptr->pointer.internal.link, left, right, key, value))!=TBT_OK ) {
845 if ( *left ) { /* child page was splited */
847 TBTPointer *rightptr = (TBTPointer*)( (*left)->page.data + ((*left)->page.npointer-1) * db->pointersize );
850 if ( ISINFPOINTER(db, &(page->page), ptr) ) {
851 deleteKV(db, &(page->page), &ptr, 1);
852 K.length = 0; K.value = NULL;
853 packInternalKV(db, &(page->page), ptr, &K, (*right)->pagenumber);
855 ptr->pointer.internal.link=(*right)->pagenumber;
857 K.length = ( db->keylen ) ? db->keylen : rightptr->key.varlen.length;
858 K.value = ( db->keylen ) ? rightptr->key.fixed.key : (*left)->page.data + rightptr->key.varlen.offset;
860 size = db->pointersize + ( ( db->keylen ) ? 0 : PTRALIGN(K.length) );
862 if ( size <= page->page.freespace ) {
863 packInternalKV(db, &(page->page), ptr, &K, (*left)->pagenumber);
866 rc = splitPage(db, page, &newright, &ptr, size, &K, NULL, (*left)->pagenumber);
867 if ( rc != TBT_OK ) return rc;
869 page->issynced = newright->issynced = 0;
870 page->islocked = newright->islocked = 1;
876 if ( (rc=savePage(db, *left))!=TBT_OK )
878 if ( (rc=savePage(db, *right))!=TBT_OK )
884 if ( page->pagenumber==TBTPAGEROOT ) { /* root was splited */
889 if ( (rc=TBTNewPage(db,&newleft))!=TBT_OK )
891 memcpy( &(newleft->page) , &(page->page), TBTREEPAGESIZE);
892 memset( &(page->page), 0, TBTREEPAGESIZE);
893 page->page.freespace=TBTREEPAGESIZE-TBTPAGEHDRSZ;
895 ptr = (TBTPointer*)( newleft->page.data + (newleft->page.npointer-1) * db->pointersize );
896 K.length = ( db->keylen ) ? db->keylen : ptr->key.varlen.length;
897 K.value = ( db->keylen ) ? ptr->key.fixed.key : newleft->page.data + ptr->key.varlen.offset;
898 packInternalKV(db, &(page->page), NULL, &K, newleft->pagenumber);
900 ptr = (TBTPointer*)( newright->page.data + (newright->page.npointer-1) * db->pointersize );
903 packInternalKV(db, &(page->page), NULL, &K, newright->pagenumber);
905 if ( (rc=savePage(db, newright))!=TBT_OK )
907 if ( (rc=savePage(db, newleft))!=TBT_OK )
909 if ( (rc=savePage(db, page))!=TBT_OK )
915 } else if ( (rc=savePage(db, page))!=TBT_OK )
923 TBTInsert(TBTree *db, TBTValue *key, TBTValue *value) {
925 TBTMemPage *lpage=NULL, *rpage=NULL;
927 rc = layerInsert(db, TBTPAGEROOT, &lpage, &rpage, key, value);
929 tassert( lpage==NULL && rpage==NULL );
935 TBTDelete(TBTree *db, TBTValue *key) {
940 if ( (rc=findPage(db, key, &page))!=TBT_OK )
946 if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
947 if ( !(page)->iscached )
952 deleteKV(db, &(page->page), &ptr, 1);
955 rc=savePage(db, page);
961 TBTInitIterator(TBTree *db, TBTIterator *iterator) {
962 TBTMemPage *page=NULL;
965 u_int32_t pagenum=TBTPAGEROOT;
967 memset(iterator, 0, sizeof(TBTIterator));
970 if ( page && !page->iscached )
972 if ( (rc=TBTReadPage(db, pagenum, &page))!=TBT_OK )
975 ptr = (TBTPointer*)( page->page.data );
976 pagenum = ptr->pointer.internal.link;
977 } while( !page->page.isleaf );
980 iterator->page = page;
987 TBTFreeIterator(TBTree *db, TBTIterator *iterator) {
989 if ( iterator->page ) {
990 iterator->page->islocked=0;
991 if ( !iterator->page->iscached )
992 tfree(iterator->page);
995 memset(iterator, 0, sizeof(TBTIterator));
999 TBTIterate(TBTree *db, TBTIterator *iterator, TBTValue *key, TBTValue *value ) {
1002 if ( iterator->page==NULL ) {
1003 memset(key, 0, sizeof(TBTValue));
1004 memset(value, 0, sizeof(TBTValue));
1008 if ( (char*)(iterator->ptr) - iterator->page->page.data >= db->pointersize * iterator->page->page.npointer ) {
1010 u_int32_t pagenumber = iterator->page->page.rightlink;
1011 iterator->page->islocked=0;
1012 if ( !iterator->page->iscached )
1013 tfree(iterator->page);
1014 iterator->page=NULL;
1016 if ( pagenumber==TBTPAGEROOT ) {
1017 memset(key, 0, sizeof(TBTValue));
1018 memset(value, 0, sizeof(TBTValue));
1021 if ( (rc=TBTReadPage(db, pagenumber, &(iterator->page)))!=TBT_OK )
1023 } while ( iterator->page->page.npointer == 0 );
1025 iterator->page->islocked=1;
1026 iterator->ptr = (TBTPointer*)( iterator->page->page.data );
1030 key->length = db->keylen;
1031 key->value = iterator->ptr->key.fixed.key;
1033 key->length = iterator->ptr->key.varlen.length;
1034 key->value = iterator->page->page.data + iterator->ptr->key.varlen.offset;
1037 value->length = iterator->ptr->pointer.leaf.length;
1038 value->value = iterator->page->page.data + iterator->ptr->pointer.leaf.offset;
1040 iterator->ptr = ( TBTPointer* ) ( (char*)(iterator->ptr) + db->pointersize );