2 * Copyright (c) 2005 Teodor Sigaev <teodor@sigaev.ru>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the author nor the names of any co-contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
25 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
27 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 #include <sys/types.h>
47 static int TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) ;
48 static int TBTWritePage(TBTree *db, TBTMemPage *page) ;
49 static int TBTNewPage(TBTree *db, TBTMemPage **page) ;
54 TBTOpen(TBTree *db, char *file) {
58 db->fd = open(file, O_RDONLY);
60 tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
63 if ( flock( db->fd, LOCK_SH ) < 0 ) {
64 tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
69 db->fd = open(file, O_CREAT | O_RDWR, 0666);
71 tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
74 if ( flock( db->fd, LOCK_EX ) < 0 ) {
75 tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
82 db->Cache=(TBTMemPage**)t0malloc( sizeof(TBTMemPage*)*HASHSIZE(db->npage) );
86 db->cachehits = db->pageread = db->pagewrite = 0;
88 if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
89 tlog(TL_CRIT,"TBTOpen: lseek failed: %s",strerror(errno));
90 flock( db->fd, LOCK_UN );
97 tlog(TL_CRIT,"TBTOpen: file is void");
98 flock( db->fd, LOCK_UN );
103 if ( TBTNewPage(db, &page) != TBT_OK ) {
104 flock( db->fd, LOCK_UN );
109 if ( TBTWritePage(db, page)!=TBT_OK ){
110 flock( db->fd, LOCK_UN );
117 db->pointersize = TBTPOINTERSIZE(db);
118 db->lastpagenumber=0;
123 TBTClose(TBTree *db) {
131 TBTMemPage *ptr,*tmp;
133 for(i=0; i<HASHSIZE(db->npage); i++) {
145 flock( db->fd, LOCK_UN );
148 return (rc) ? TBT_ERROR : TBT_OK;
152 cmpNPage(const void *a, const void *b) {
153 tassert( (*(TBTMemPage**)a)->pagenumber != (*(TBTMemPage**)b)->pagenumber );
154 return ( (*(TBTMemPage**)a)->pagenumber > (*(TBTMemPage**)b)->pagenumber ) ? 1 : -1;
159 TBTSync(TBTree *db) {
162 if ( db->readonly || db->npage==0 )
166 u_int32_t i, total=0;
167 TBTMemPage *ptr, **data = (TBTMemPage**)tmalloc(db->npage*sizeof(TBTMemPage*)), **pptr;
169 for(i=0; i<HASHSIZE(db->npage); i++) {
172 if ( !ptr->issynced ) {
183 qsort( data, total, sizeof(TBTMemPage*), cmpNPage);
186 while( pptr-data<total) {
187 rc |= TBTWritePage(db, *pptr);
194 if ( fsync(db->fd) ) {
195 tlog(TL_CRIT,"TBTSync: fsync failed: %s",strerror(errno));
199 return (rc) ? TBT_ERROR : TBT_OK;
203 findInCache(TBTree *db, u_int32_t pagenumber) {
206 ptr = db->Cache[ pagenumber % HASHSIZE(db->npage) ];
208 while(ptr && ptr->pagenumber != pagenumber)
215 removeFromCache(TBTree *db, u_int32_t pagenumber) {
216 TBTMemPage *ptr, *prev=NULL;
217 int pos = pagenumber % HASHSIZE(db->npage);
219 ptr = db->Cache[ pos ];
222 if ( ptr->pagenumber == pagenumber ) {
224 prev->link = ptr->link;
226 db->Cache[ pos ] = ptr->link;
235 insertInCache(TBTree *db, TBTMemPage *page) {
236 int pos = page->pagenumber % HASHSIZE(db->npage);
238 page->link = db->Cache[ pos ];
239 db->Cache[ pos ] = page;
243 PageAlloc(TBTree *db) {
246 if ( db->curpage < db->npage ) {
247 page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage));
248 memset( page, 0, TBTMEMPAGEHDRSZ );
250 if ( db->curpage == 0 ) {
251 db->TimeCache = db->TimeCacheLast = page;
253 db->TimeCacheLast->next = page;
254 page->prev = db->TimeCacheLast;
255 db->TimeCacheLast = page;
258 } else if ( db->npage == 0 ) {
259 page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage));
260 memset( page, 0, TBTMEMPAGEHDRSZ );
262 page = db->TimeCache;
263 while( page->next && page->islocked )
265 if ( page==db->TimeCacheLast && page->islocked ) { /*all pages locked*/
266 /* tlog(TL_WARN,"TBTReadPage: all pages in cache locked, increase number of cached pages"); */
267 page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage));
268 memset( page, 0, TBTMEMPAGEHDRSZ );
270 if ( !page->issynced )
271 if ( TBTWritePage(db, page) )
273 removeFromCache(db, page->pagenumber);
274 if ( page != db->TimeCacheLast ) {
275 if ( page == db->TimeCache ) {
276 db->TimeCache = page->next;
277 db->TimeCache->prev = NULL;
279 page->prev->next = page->next;
280 page->next->prev = page->prev;
282 memset( page, 0, TBTMEMPAGEHDRSZ );
283 db->TimeCacheLast->next = page;
284 page->prev = db->TimeCacheLast;
285 db->TimeCacheLast = page;
287 memset( page, 0, TBTMEMPAGEHDRSZ );
296 TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) {
299 if ( db->npage && (*page=findInCache(db, pagenumber))!=NULL ) {
301 if ( *page != db->TimeCacheLast ) {
302 if ( (*page) == db->TimeCache ) {
303 db->TimeCache = (*page)->next;
304 db->TimeCache->prev=NULL;
306 (*page)->prev->next = (*page)->next;
307 (*page)->next->prev = (*page)->prev;
309 db->TimeCacheLast->next = (*page);
310 (*page)->prev = db->TimeCacheLast;
311 db->TimeCacheLast = *page;
312 (*page)->next = NULL;
315 off_t offset = (off_t)pagenumber * (off_t)TBTREEPAGESIZE;
317 if( (*page = PageAlloc(db))==NULL )
320 if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
321 tlog(TL_CRIT,"TBTReadPage: lseek failed: %s",strerror(errno));
325 (*page)->pagenumber=pagenumber;
326 if ( (rc=read(db->fd, &( (*page)->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
327 tlog(TL_CRIT,"TBTReadPage: read failed: %s %d %d",strerror(errno), pagenumber, offset);
330 if ( (*page)->iscached )
331 insertInCache(db, *page);
339 TBTWritePage(TBTree *db, TBTMemPage *page) {
340 off_t offset = (off_t)(page->pagenumber) * (off_t)TBTREEPAGESIZE;
343 if ( page->issynced )
346 if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
347 tlog(TL_CRIT,"TBTWritePage: lseek failed: %s",strerror(errno));
351 if ( (size=write(db->fd, &( page->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
352 tlog(TL_CRIT,"TBTWritePage: write failed: %s (writed only %d bytes)",strerror(errno), size);
363 TBTNewPage(TBTree *db, TBTMemPage **page) {
364 if ( db->lastpagenumber==0 ) {
366 if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
367 tlog(TL_CRIT,"TBTNewPage: lseek failed: %s",strerror(errno));
371 if ( offset % TBTREEPAGESIZE != 0 ) {
372 tlog(TL_CRIT,"TBTNewPage: file corrupted");
376 db->lastpagenumber = offset/TBTREEPAGESIZE;
379 if( (*page = PageAlloc(db))==NULL )
382 (*page)->pagenumber = db->lastpagenumber;
386 db->lastpagenumber++;
388 memset( &((*page)->page), 0, TBTMEMPAGEHDRSZ);
389 (*page)->page.freespace = TBTREEPAGESIZE-TBTPAGEHDRSZ;
391 if ( (*page)->iscached )
392 insertInCache(db, *page);
398 printValue(u_int32_t len, char *ptr) {
408 dumpPage(TBTree *db, TBTPage *page, int follow) {
411 for(j=0;j<follow;j++)
413 printf("Page: f:%d l:%d r:%d n:%d\n",
419 for(i=0;i<page->npointer;i++) {
420 TBTPointer *ptr= (TBTPointer*)(page->data+db->pointersize*i);
421 for(j=0;j<follow+2;j++)
423 printf("i:%d(%d) kl:%d ko:%d ",
425 (db->keylen) ? db->keylen : ptr->key.varlen.length,
426 (db->keylen) ? 0 : ptr->key.varlen.offset
429 printValue(ptr->key.varlen.length, page->data + ptr->key.varlen.offset);
431 printf("'%d'", *(int*)(ptr->key.fixed.key));
432 if ( page->isleaf ) {
433 printf(" vl:%d vo:%d ",
434 ptr->pointer.leaf.length,
435 ptr->pointer.leaf.offset
437 printValue(ptr->pointer.leaf.length, page->data + ptr->pointer.leaf.offset );
440 printf(" page:%d\n", ptr->pointer.internal.link);
442 dumpTree(db, ptr->pointer.internal.link, follow+4);
448 dumpTree(TBTree *db, u_int32_t pagenumber, int follow) {
451 if ( TBTReadPage(db, pagenumber, &page) ) {
452 tlog(TL_CRIT,"BEDA %d\n", pagenumber);
457 dumpPage(db, &(page->page), follow);
459 if ( !page->iscached )
464 findInPage(TBTree *db, TBTPage *page, TBTValue *key, TBTPointer **res) {
465 TBTPointer *StopLow = (TBTPointer*)(page->data);
466 TBTPointer *StopHigh = (TBTPointer*)(page->data + db->pointersize*page->npointer);
467 TBTPointer *StopMiddle;
471 A.length = db->keylen;
472 /* Loop invariant: StopLow <= StopMiddle < StopHigh */
473 while (StopLow < StopHigh) {
474 StopMiddle = (TBTPointer*)(((char*)StopLow) + db->pointersize*((((char*)StopHigh - (char*)StopLow)/db->pointersize) >> 1));
475 if ( db->keylen == 0 ) {
476 A.length = StopMiddle->key.varlen.length;
477 A.value = page->data + StopMiddle->key.varlen.offset;
479 A.value = StopMiddle->key.fixed.key;
482 if ( ISINFPOINTER(db, page, StopMiddle) )
485 diff = db->cmpkey( &A, key );
489 } else if ( diff < 0 )
490 StopLow = (TBTPointer*)((char*)StopMiddle + db->pointersize);
492 StopHigh = StopMiddle;
500 findPage(TBTree *db, TBTValue *key, TBTMemPage **page) {
501 u_int32_t pagenumber = TBTPAGEROOT;
508 if ( (rc=TBTReadPage(db, pagenumber, page))!=TBT_OK )
511 if ( (*page)->page.isleaf )
514 findInPage(db, &((*page)->page), key, &ptr );
515 pagenumber = ptr->pointer.internal.link;
516 if ( !(*page)->iscached )
524 TBTFind(TBTree *db, TBTValue *key, TBTValue *value) {
529 memset(value, 0, sizeof(TBTValue));
531 if ( (rc=findPage(db, key, &page))!=TBT_OK )
537 if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
538 if ( !(page)->iscached )
543 value->length = ptr->pointer.leaf.length;
544 value->value = tmalloc(value->length);
545 memcpy( value->value, page->page.data + ptr->pointer.leaf.offset, value->length );
547 if ( !(page)->iscached )
554 packLeafKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, TBTValue *value) {
556 if ( (char*)ptr - page->data != page->npointer * db->pointersize )
557 memmove((char*)ptr+db->pointersize, ptr, page->npointer * db->pointersize - ( (char*)ptr - page->data ));
559 ptr = (TBTPointer*)(page->data + page->npointer * db->pointersize);
562 page->freespace -= db->pointersize + PTRALIGN(value->length);
564 memcpy( ptr->key.fixed.key, key->value, db->keylen );
565 ptr->pointer.leaf.offset = page->npointer * db->pointersize + page->freespace;
567 page->freespace -= PTRALIGN(key->length);
568 ptr->key.varlen.length = key->length;
569 ptr->key.varlen.offset = page->npointer * db->pointersize + page->freespace;
570 memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
571 ptr->pointer.leaf.offset = page->npointer * db->pointersize + page->freespace + PTRALIGN(key->length);
573 ptr->pointer.leaf.length = value->length;
574 memcpy( page->data + ptr->pointer.leaf.offset, value->value, value->length );
578 packInternalKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, u_int32_t pagenumber) {
580 if ( (char*)ptr - page->data != page->npointer * db->pointersize )
581 memmove((char*)ptr+db->pointersize, ptr, page->npointer * db->pointersize - ( (char*)ptr - page->data ));
583 ptr = (TBTPointer*)(page->data + page->npointer * db->pointersize);
586 page->freespace -= db->pointersize;
590 memcpy( ptr->key.fixed.key, key->value, db->keylen );
592 page->freespace -= PTRALIGN(key->length);
593 ptr->key.varlen.length = key->length;
594 ptr->key.varlen.offset = page->npointer * db->pointersize + page->freespace;
596 memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
598 ptr->pointer.internal.link = pagenumber;
602 deleteKV(TBTree *db, TBTPage *page, TBTPointer **ptrs, u_int32_t num) {
603 TBTPointer *ptr = (TBTPointer*)page->data, *tmp, **ptrptr = ptrs;
606 /* suppose ptrs are ordered! */
608 while( ptrptr - ptrs < num && (char*)ptr - page->data < db->pointersize*page->npointer ) {
609 if ( *ptrptr < ptr ) {
610 tlog(TL_CRIT,"deleteKV: Something wrong: unexistent *ptrs");
612 } else if ( *ptrptr > ptr ) {
614 memcpy(tmp, ptr, db->pointersize);
615 ptr = (TBTPointer*)( (char*)ptr + db->pointersize );
616 tmp = (TBTPointer*)( (char*)tmp + db->pointersize );
618 page->freespace += db->pointersize +
619 ( ( page->isleaf ) ? PTRALIGN(ptr->pointer.leaf.length) : 0 ) +
620 ( ( db->keylen ) ? 0 : PTRALIGN(ptr->key.varlen.length) );
622 ptr = (TBTPointer*)( (char*)ptr + db->pointersize );
626 if ( (char*)ptr - page->data < db->pointersize*page->npointer && tmp!=ptr )
627 memmove( tmp, ptr, page->npointer * db->pointersize - ((char*)ptr - page->data) );
631 if ( page->isleaf || db->keylen == 0 ) {
632 static char buf[TBTREEPAGESIZE];
634 u_int32_t start = page->npointer * db->pointersize + page->freespace;
636 tmp = (TBTPointer*)page->data;
637 for(i=0;i<page->npointer;i++) {
638 if ( page->isleaf ) {
639 memcpy( bufptr, page->data + tmp->pointer.leaf.offset, tmp->pointer.leaf.length );
640 tmp->pointer.leaf.offset = start + (bufptr-buf);
641 bufptr+=PTRALIGN(tmp->pointer.leaf.length);
644 if ( db->keylen == 0 ) {
645 if ( tmp->key.varlen.length )
646 memcpy( bufptr, page->data + tmp->key.varlen.offset, tmp->key.varlen.length );
647 tmp->key.varlen.offset = start + (bufptr-buf);
648 bufptr+=PTRALIGN(tmp->key.varlen.length);
650 tmp = (TBTPointer*)( (char*)tmp + db->pointersize );
653 memcpy( page->data + start, buf, bufptr-buf );
658 findDelimiter(TBTree *db, TBTPage *page, TBTPointer *ptr, int size, u_int32_t start, int *where) {
659 int *sizes = (int*) tmalloc( sizeof(int) * (page->npointer+1) );
662 int lfree=TBTREEPAGESIZE-TBTPAGEHDRSZ, rfree=TBTREEPAGESIZE-TBTPAGEHDRSZ;
663 int nptr = ( (char*)ptr - page->data ) / db->pointersize;
666 for(i=0; i<page->npointer;i++) {
667 tomove = (TBTPointer*)(page->data + i*db->pointersize);
668 if ( was==0 && i==nptr ) {
672 sizes[i+was] = db->pointersize +
673 ( (page->isleaf) ? PTRALIGN(tomove->pointer.leaf.length) : 0 ) +
674 ( ( db->keylen ) ? 0 : PTRALIGN(tomove->key.varlen.length) );
679 sizes[page->npointer]=size;
682 for(i=0;i<page->npointer+1;i++)
693 } else if ( rfree < 0 ) {
701 *where = (nptr < start) ? -1 : 1;
710 populatePages(TBTree *db, TBTPage *srcpage, TBTPage* newpage, u_int32_t start) {
712 TBTPointer *tomove, **todelete = tmalloc( sizeof(TBTPointer*)* srcpage->npointer );
713 u_int32_t numtodelete=0;
716 for(i=start;i<srcpage->npointer;i++) {
717 tomove = (TBTPointer*)(srcpage->data + i*db->pointersize);
719 todelete[numtodelete] = tomove;
723 k.value = tomove->key.fixed.key;
725 k.length = tomove->key.varlen.length;
726 k.value = srcpage->data + tomove->key.varlen.offset;
729 if ( srcpage->isleaf ) {
730 v.length = tomove->pointer.leaf.length;
731 v.value = srcpage->data + tomove->pointer.leaf.offset;
732 packLeafKV(db, newpage, NULL, &k, &v);
734 packInternalKV(db, newpage, NULL, &k, tomove->pointer.internal.link);
737 deleteKV(db, srcpage, todelete, numtodelete);
742 splitPage(TBTree *db, TBTMemPage *srcpage, TBTMemPage** newpage, TBTPointer **ptr, int size, TBTValue *key, TBTValue *value, u_int32_t pagenumber) {
744 int start=0, where=0;
745 int nptr = ( (char*)(*ptr) - srcpage->page.data ) / db->pointersize;
747 if ( TBTNewPage(db, newpage) )
751 srcpage->issynced=tmp->issynced=0;
753 tmp->page.rightlink = srcpage->page.rightlink;
754 srcpage->page.rightlink = tmp->pagenumber;
755 tmp->page.isleaf = srcpage->page.isleaf;
757 switch(db->strategy) {
758 case TBT_STATEGY_HALF:
759 start = findDelimiter(db, &(srcpage->page), *ptr, size, (srcpage->page.npointer+1)/2, &where);
761 case TBT_STATEGY_LEFTFILL:
762 start = findDelimiter(db, &(srcpage->page), *ptr, size, srcpage->page.npointer, &where);
764 case TBT_STATEGY_RIGHTFILL:
765 start = findDelimiter(db, &(srcpage->page), *ptr, size, 0, &where);
768 tlog(TL_CRIT,"splitPage: unknown strategy: %d", db->strategy);
772 populatePages(db, &(srcpage->page), &(tmp->page), start);
777 *ptr = (TBTPointer*)(tmp->page.data + nptr*db->pointersize);
779 if ( tmp->page.isleaf )
780 packLeafKV(db, &(tmp->page), *ptr, key, value);
782 packInternalKV(db, &(tmp->page), *ptr, key, pagenumber);
787 savePage(TBTree *db, TBTMemPage *page) {
791 if ( !page->iscached ) {
792 if ( page->issynced==0 && (rc=TBTWritePage(db, page)) != TBT_OK )
800 layerInsert(TBTree *db, u_int32_t pagenumber, TBTMemPage **left, TBTMemPage **right, TBTValue *key, TBTValue *value) {
802 TBTPointer *ptr=NULL;
803 TBTMemPage *page=NULL, *newright=NULL;
805 if ( (rc=TBTReadPage(db, pagenumber, &page))!=TBT_OK )
810 fnd = findInPage(db, &(page->page), key, &ptr );
813 if ( page->page.isleaf ) {
814 u_int32_t size = db->pointersize +
815 PTRALIGN(value->length) +
816 ( ( db->keylen ) ? 0 : PTRALIGN(key->length) );
818 if ( size>(TBTREEPAGESIZE-TBTPAGEHDRSZ)/2 ) {
819 tlog(TL_ALARM,"Huge size: %d bytes", size);
824 deleteKV(db, &(page->page), &ptr, 1);
828 if ( size <= page->page.freespace ) {
829 u_int32_t oldsize=page->page.freespace;
830 packLeafKV(db, &(page->page), ptr, key, value);
831 tassert( oldsize == page->page.freespace + size );
834 rc = splitPage(db, page, &newright, &ptr, size, key, value, 0);
835 if ( rc != TBT_OK ) return rc;
837 page->issynced = newright->issynced = 0;
838 page->islocked = newright->islocked = 1;
841 if ( (rc=layerInsert(db, ptr->pointer.internal.link, left, right, key, value))!=TBT_OK ) {
846 if ( *left ) { /* child page was splited */
848 TBTPointer *rightptr = (TBTPointer*)( (*left)->page.data + ((*left)->page.npointer-1) * db->pointersize );
851 if ( ISINFPOINTER(db, &(page->page), ptr) ) {
852 deleteKV(db, &(page->page), &ptr, 1);
853 K.length = 0; K.value = NULL;
854 packInternalKV(db, &(page->page), ptr, &K, (*right)->pagenumber);
856 ptr->pointer.internal.link=(*right)->pagenumber;
858 K.length = ( db->keylen ) ? db->keylen : rightptr->key.varlen.length;
859 K.value = ( db->keylen ) ? rightptr->key.fixed.key : (*left)->page.data + rightptr->key.varlen.offset;
861 size = db->pointersize + ( ( db->keylen ) ? 0 : PTRALIGN(K.length) );
863 if ( size <= page->page.freespace ) {
864 packInternalKV(db, &(page->page), ptr, &K, (*left)->pagenumber);
867 rc = splitPage(db, page, &newright, &ptr, size, &K, NULL, (*left)->pagenumber);
868 if ( rc != TBT_OK ) return rc;
870 page->issynced = newright->issynced = 0;
871 page->islocked = newright->islocked = 1;
877 if ( (rc=savePage(db, *left))!=TBT_OK )
879 if ( (rc=savePage(db, *right))!=TBT_OK )
885 if ( page->pagenumber==TBTPAGEROOT ) { /* root was splited */
890 if ( (rc=TBTNewPage(db,&newleft))!=TBT_OK )
892 memcpy( &(newleft->page) , &(page->page), TBTREEPAGESIZE);
893 memset( &(page->page), 0, TBTREEPAGESIZE);
894 page->page.freespace=TBTREEPAGESIZE-TBTPAGEHDRSZ;
896 ptr = (TBTPointer*)( newleft->page.data + (newleft->page.npointer-1) * db->pointersize );
897 K.length = ( db->keylen ) ? db->keylen : ptr->key.varlen.length;
898 K.value = ( db->keylen ) ? ptr->key.fixed.key : newleft->page.data + ptr->key.varlen.offset;
899 packInternalKV(db, &(page->page), NULL, &K, newleft->pagenumber);
901 ptr = (TBTPointer*)( newright->page.data + (newright->page.npointer-1) * db->pointersize );
904 packInternalKV(db, &(page->page), NULL, &K, newright->pagenumber);
906 if ( (rc=savePage(db, newright))!=TBT_OK )
908 if ( (rc=savePage(db, newleft))!=TBT_OK )
910 if ( (rc=savePage(db, page))!=TBT_OK )
916 } else if ( (rc=savePage(db, page))!=TBT_OK )
924 TBTInsert(TBTree *db, TBTValue *key, TBTValue *value) {
926 TBTMemPage *lpage=NULL, *rpage=NULL;
928 rc = layerInsert(db, TBTPAGEROOT, &lpage, &rpage, key, value);
930 tassert( lpage==NULL && rpage==NULL );
936 TBTDelete(TBTree *db, TBTValue *key) {
941 if ( (rc=findPage(db, key, &page))!=TBT_OK )
947 if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
948 if ( !(page)->iscached )
953 deleteKV(db, &(page->page), &ptr, 1);
956 rc=savePage(db, page);
962 TBTInitIterator(TBTree *db, TBTIterator *iterator) {
963 TBTMemPage *page=NULL;
966 u_int32_t pagenum=TBTPAGEROOT;
968 memset(iterator, 0, sizeof(TBTIterator));
971 if ( page && !page->iscached )
973 if ( (rc=TBTReadPage(db, pagenum, &page))!=TBT_OK )
976 ptr = (TBTPointer*)( page->page.data );
977 pagenum = ptr->pointer.internal.link;
978 } while( !page->page.isleaf );
981 iterator->page = page;
988 TBTFreeIterator(TBTree *db, TBTIterator *iterator) {
990 if ( iterator->page ) {
991 iterator->page->islocked=0;
992 if ( !iterator->page->iscached )
993 tfree(iterator->page);
996 memset(iterator, 0, sizeof(TBTIterator));
1000 TBTIterate(TBTree *db, TBTIterator *iterator, TBTValue *key, TBTValue *value ) {
1003 if ( iterator->page==NULL ) {
1004 memset(key, 0, sizeof(TBTValue));
1005 memset(value, 0, sizeof(TBTValue));
1009 if ( (char*)(iterator->ptr) - iterator->page->page.data >= db->pointersize * iterator->page->page.npointer ) {
1011 u_int32_t pagenumber = iterator->page->page.rightlink;
1012 iterator->page->islocked=0;
1013 if ( !iterator->page->iscached )
1014 tfree(iterator->page);
1015 iterator->page=NULL;
1017 if ( pagenumber==TBTPAGEROOT ) {
1018 memset(key, 0, sizeof(TBTValue));
1019 memset(value, 0, sizeof(TBTValue));
1022 if ( (rc=TBTReadPage(db, pagenumber, &(iterator->page)))!=TBT_OK )
1024 } while ( iterator->page->page.npointer == 0 );
1026 iterator->page->islocked=1;
1027 iterator->ptr = (TBTPointer*)( iterator->page->page.data );
1031 key->length = db->keylen;
1032 key->value = iterator->ptr->key.fixed.key;
1034 key->length = iterator->ptr->key.varlen.length;
1035 key->value = iterator->page->page.data + iterator->ptr->key.varlen.offset;
1038 value->length = iterator->ptr->pointer.leaf.length;
1039 value->value = iterator->page->page.data + iterator->ptr->pointer.leaf.offset;
1041 iterator->ptr = ( TBTPointer* ) ( (char*)(iterator->ptr) + db->pointersize );
1047 TBTGetFirst(TBTree *db, TBTValue *key, TBTValue *value) {
1048 TBTIterator iterator;
1051 if ( (rc=TBTInitIterator(db, &iterator))!=TBT_OK )
1054 rc=TBTIterate(db, &iterator, key, value);
1059 ptr = tmalloc( key->length );
1060 memcpy( ptr, key->value, key->length );
1063 ptr = tmalloc( value->length );
1064 memcpy( ptr, value->value, value->length );
1068 TBTFreeIterator(db, &iterator);
1074 findLast(TBTree *db, TBTValue *key, TBTValue *value, u_int32_t pagenumber) {
1079 if ( (rc=TBTReadPage(db, pagenumber, &page)) != TBT_OK )
1082 if ( page->page.npointer==0 ) {
1083 if ( !page->iscached )
1089 if ( page->page.isleaf ) {
1090 ptr = (TBTPointer*)(page->page.data + db->pointersize * (page->page.npointer-1));
1092 key->length = ( db->keylen ) ? db->keylen : ptr->key.varlen.length;
1093 key->value = tmalloc( key->length );
1094 memcpy( key->value, ( db->keylen ) ? ptr->key.fixed.key : page->page.data + ptr->key.varlen.offset, key->length );
1096 value->length = ptr->pointer.leaf.length;
1097 value->value = tmalloc( value->length );
1098 memcpy( value->value, page->page.data + ptr->pointer.leaf.offset, value->length );
1100 for(i=page->page.npointer-1; i>=0; i--) {
1101 ptr = (TBTPointer*)( page->page.data + db->pointersize * i );
1102 rc = findLast(db, key, value, ptr->pointer.internal.link);
1103 if ( rc!=TBT_OK || key->value )
1109 if ( !page->iscached )
1116 TBTGetLast(TBTree *db, TBTValue *key, TBTValue *value) {
1117 memset(key, 0, sizeof(TBTValue));
1118 memset(value, 0, sizeof(TBTValue));
1119 return findLast(db, key, value, TBTPAGEROOT);