2 * Copyright (c) 2005 Teodor Sigaev <teodor@sigaev.ru>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the author nor the names of any co-contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS
18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
25 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
27 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 #include <sys/types.h>
47 static int TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) ;
48 static int TBTWritePage(TBTree *db, TBTMemPage *page) ;
49 static int TBTNewPage(TBTree *db, TBTMemPage **page) ;
54 TBTOpen(TBTree *db, char *file) {
58 db->fd = open(file, O_RDONLY);
60 tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
63 if ( flock( db->fd, LOCK_SH ) < 0 ) {
64 tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
69 db->fd = open(file, O_CREAT | O_RDWR, 0666);
71 tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno));
74 if ( flock( db->fd, LOCK_EX ) < 0 ) {
75 tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno));
82 db->Cache=(TBTMemPage**)t0malloc( sizeof(TBTMemPage*)*db->npage );
86 db->cachehits = db->pageread = db->pagewrite = 0;
88 if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
89 tlog(TL_CRIT,"TBTOpen: lseek failed: %s",strerror(errno));
90 flock( db->fd, LOCK_UN );
97 tlog(TL_CRIT,"TBTOpen: file is void");
98 flock( db->fd, LOCK_UN );
103 if ( TBTNewPage(db, &page) != TBT_OK ) {
104 flock( db->fd, LOCK_UN );
109 if ( TBTWritePage(db, page)!=TBT_OK ){
110 flock( db->fd, LOCK_UN );
117 db->lastpagenumber=0;
122 TBTClose(TBTree *db) {
131 for(i=0; i<db->curpage; i++)
132 tfree( db->Cache[i] );
137 flock( db->fd, LOCK_UN );
140 return (rc) ? TBT_ERROR : TBT_OK;
144 TBTSync(TBTree *db) {
148 if ( db->readonly || db->npage==0 )
151 for(i=0; i<db->curpage; i++)
152 if ( !db->Cache[i]->issynced )
153 rc |= TBTWritePage(db, db->Cache[i]);
155 if ( fsync(db->fd) ) {
156 tlog(TL_CRIT,"TBTSync: fsync failed: %s",strerror(errno));
160 return (rc) ? TBT_ERROR : TBT_OK;
164 findInCache(TBTree *db, u_int32_t pagenumber, int *pos, TBTMemPage **StopLow, TBTMemPage **StopHigh) {
165 TBTMemPage **StopMiddle;
167 /* Loop invariant: StopLow <= StopMiddle < StopHigh */
168 while (StopLow < StopHigh) {
169 StopMiddle = StopLow + ( ( StopHigh - StopLow ) >> 1 );
171 if ( (*StopMiddle)->pagenumber == pagenumber ) {
172 *pos = StopMiddle - db->Cache;
174 } else if ( (*StopMiddle)->pagenumber < pagenumber )
175 StopLow = StopMiddle + 1;
177 StopHigh = StopMiddle;
180 *pos = StopHigh - db->Cache;
185 PageAlloc(TBTree *db, int *posincache) {
189 if ( db->curpage < db->npage ) {
190 page = db->Cache[db->curpage] = (TBTMemPage*)tmalloc(sizeof(TBTMemPage));
191 memset( page, 0, TBTMEMPAGEHDRSZ );
193 if ( db->curpage == 0 ) {
194 db->TimeCache = db->TimeCacheLast = page;
196 db->TimeCacheLast->next = page;
197 page->prev = db->TimeCacheLast;
198 db->TimeCacheLast = page;
200 *posincache = db->curpage;
202 } else if ( db->npage == 0 ) {
203 page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage));
204 memset( page, 0, TBTMEMPAGEHDRSZ );
206 page = db->TimeCache;
207 while( page->next && page->islocked )
209 if ( page==db->TimeCacheLast && page->islocked ) { /*all pages locked*/
210 /*tlog(TL_WARN,"TBTReadPage: all pages in cache locked, increase number of cached pages");*/
211 page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage));
212 memset( page, 0, TBTMEMPAGEHDRSZ );
214 if ( !page->issynced )
215 if ( TBTWritePage(db, page) )
217 if( findInCache(db, page->pagenumber, posincache, db->Cache, db->Cache + db->curpage) != FOUND )
218 tlog(TL_CRIT,"PageAlloc: something wrong: no page with pagenumber %d in cache", page->pagenumber);
219 memset( page, 0, TBTMEMPAGEHDRSZ );
221 if ( page != db->TimeCacheLast ) {
223 page->prev->next = page->next;
225 page->next->prev = page->prev;
226 db->TimeCacheLast->next = page;
227 page->prev = db->TimeCacheLast;
228 db->TimeCacheLast = page;
238 findAndMovePN(TBTree *db, TBTMemPage *page, int pos) {
241 if ( db->curpage < 2 )
246 tassert( pos < db->curpage && db->Cache[pos]==page );
248 if ( pos+1 != db->curpage && db->Cache[pos+1]->pagenumber < page->pagenumber ) {
249 if ( findInCache(db, page->pagenumber, &newpos, db->Cache + pos + 1, db->Cache + db->curpage )!=NOTFOUND)
250 tlog(TL_CRIT,"findAndMovePN: Something wrong, duplicate page with pagenumber %d", page->pagenumber);
251 memmove( db->Cache + pos, db->Cache + pos + 1, (newpos-pos-1)*sizeof(TBTMemPage**) );
252 db->Cache[newpos-1] = page;
253 } else if ( pos!=0 && db->Cache[pos-1]->pagenumber > page->pagenumber ) {
254 if( findInCache(db, page->pagenumber, &newpos, db->Cache, db->Cache + pos )!=NOTFOUND);
255 tlog(TL_CRIT,"findAndMovePN: Something wrong, duplicate page with pagenumber %d", page->pagenumber);
256 memmove( db->Cache + newpos + 1, db->Cache + newpos, (pos-newpos)*sizeof(TBTMemPage**));
257 db->Cache[newpos] = page;
262 TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) {
265 if ( findInCache(db, pagenumber, &pos, db->Cache, db->Cache + db->curpage)==FOUND ) {
267 *page=db->Cache[pos];
268 if ( *page != db->TimeCacheLast ) {
270 (*page)->prev->next = (*page)->next;
272 (*page)->next->prev = (*page)->prev;
273 db->TimeCacheLast->next = (*page);
274 (*page)->prev = db->TimeCacheLast;
275 db->TimeCacheLast = *page;
276 (*page)->next = NULL;
279 off_t offset = (off_t)pagenumber * (off_t)TBTREEPAGESIZE;
281 if( (*page = PageAlloc(db, &pos))==NULL )
284 if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
285 tlog(TL_CRIT,"TBTReadPage: lseek failed: %s",strerror(errno));
289 (*page)->pagenumber=pagenumber;
290 if ( (rc=read(db->fd, &( (*page)->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
291 tlog(TL_CRIT,"TBTReadPage: read failed: %s %d %d",strerror(errno), pagenumber, offset);
294 if ( (*page)->iscached )
295 findAndMovePN(db, *page, pos);
303 TBTWritePage(TBTree *db, TBTMemPage *page) {
304 off_t offset = (off_t)(page->pagenumber) * (off_t)TBTREEPAGESIZE;
307 if ( page->issynced )
310 if ( lseek(db->fd, offset, SEEK_SET) != offset ) {
311 tlog(TL_CRIT,"TBTWritePage: lseek failed: %s",strerror(errno));
315 if ( (size=write(db->fd, &( page->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) {
316 tlog(TL_CRIT,"TBTWritePage: write failed: %s (writed only %d bytes)",strerror(errno), size);
327 TBTNewPage(TBTree *db, TBTMemPage **page) {
330 if ( db->lastpagenumber==0 ) {
332 if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) {
333 tlog(TL_CRIT,"TBTNewPage: lseek failed: %s",strerror(errno));
337 if ( offset % TBTREEPAGESIZE != 0 ) {
338 tlog(TL_CRIT,"TBTNewPage: file corrupted");
342 db->lastpagenumber = offset/TBTREEPAGESIZE;
345 if( (*page = PageAlloc(db, &pos))==NULL )
348 (*page)->pagenumber = db->lastpagenumber;
352 db->lastpagenumber++;
354 memset( &((*page)->page), 0, TBTMEMPAGEHDRSZ);
355 (*page)->page.freespace = TBTREEPAGESIZE-TBTPAGEHDRSZ;
357 if ( (*page)->iscached )
358 findAndMovePN(db, *page, pos);
364 printValue(u_int32_t len, char *ptr) {
374 dumpPage(TBTree *db, TBTPage *page, int follow) {
377 for(j=0;j<follow;j++)
379 printf("Page: f:%d l:%d r:%d n:%d\n",
385 for(i=0;i<page->npointer;i++) {
386 TBTPointer *ptr= (TBTPointer*)(page->data+TBTPOINTERSIZE(db)*i);
387 for(j=0;j<follow+2;j++)
389 printf("i:%d(%d) kl:%d ko:%d ",
391 (db->keylen) ? db->keylen : ptr->key.varlen.length,
392 (db->keylen) ? 0 : ptr->key.varlen.offset
395 printValue(ptr->key.varlen.length, page->data + ptr->key.varlen.offset);
397 printf("'%d'", *(int*)(ptr->key.fixed.key));
398 if ( page->isleaf ) {
399 printf(" vl:%d vo:%d ",
400 ptr->pointer.leaf.length,
401 ptr->pointer.leaf.offset
403 printValue(ptr->pointer.leaf.length, page->data + ptr->pointer.leaf.offset );
406 printf(" page:%d\n", ptr->pointer.internal.link);
408 dumpTree(db, ptr->pointer.internal.link, follow+4);
414 dumpTree(TBTree *db, u_int32_t pagenumber, int follow) {
417 if ( TBTReadPage(db, pagenumber, &page) ) {
418 tlog(TL_CRIT,"BEDA %d\n", pagenumber);
423 dumpPage(db, &(page->page), follow);
425 if ( !page->iscached )
430 findInPage(TBTree *db, TBTPage *page, TBTValue *key, TBTPointer **res) {
431 TBTPointer *StopLow = (TBTPointer*)(page->data);
432 TBTPointer *StopHigh = (TBTPointer*)(page->data + TBTPOINTERSIZE(db)*page->npointer);
433 TBTPointer *StopMiddle;
437 /* Loop invariant: StopLow <= StopMiddle < StopHigh */
438 while (StopLow < StopHigh) {
439 StopMiddle = (TBTPointer*)(((char*)StopLow) + TBTPOINTERSIZE(db)*((((char*)StopHigh - (char*)StopLow)/TBTPOINTERSIZE(db)) >> 1));
440 if ( db->keylen == 0 ) {
441 A.length = StopMiddle->key.varlen.length;
442 A.value = page->data + StopMiddle->key.varlen.offset;
444 A.length = db->keylen;
445 A.value = StopMiddle->key.fixed.key;
448 if ( ISINFPOINTER(db, page, StopMiddle) )
451 diff = db->cmpkey( &A, key );
455 } else if ( diff < 0 )
456 StopLow = (TBTPointer*)((char*)StopMiddle + TBTPOINTERSIZE(db));
458 StopHigh = StopMiddle;
466 findPage(TBTree *db, TBTValue *key, TBTMemPage **page) {
467 u_int32_t pagenumber = TBTPAGEROOT;
474 if ( (rc=TBTReadPage(db, pagenumber, page))!=TBT_OK )
477 if ( (*page)->page.isleaf )
480 findInPage(db, &((*page)->page), key, &ptr );
481 pagenumber = ptr->pointer.internal.link;
482 if ( !(*page)->iscached )
490 TBTFind(TBTree *db, TBTValue *key, TBTValue *value) {
495 memset(value, 0, sizeof(TBTValue));
497 if ( (rc=findPage(db, key, &page))!=TBT_OK )
503 if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
504 if ( !(page)->iscached )
509 value->length = ptr->pointer.leaf.length;
510 value->value = tmalloc(value->length);
511 memcpy( value->value, page->page.data + ptr->pointer.leaf.offset, value->length );
513 if ( !(page)->iscached )
520 packLeafKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, TBTValue *value) {
522 if ( (char*)ptr - page->data != page->npointer * TBTPOINTERSIZE(db) )
523 memmove((char*)ptr+TBTPOINTERSIZE(db), ptr, page->npointer * TBTPOINTERSIZE(db) - ( (char*)ptr - page->data ));
525 ptr = (TBTPointer*)(page->data + page->npointer * TBTPOINTERSIZE(db));
528 page->freespace -= TBTPOINTERSIZE(db) + PTRALIGN(value->length);
530 memcpy( ptr->key.fixed.key, key->value, db->keylen );
531 ptr->pointer.leaf.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
533 page->freespace -= PTRALIGN(key->length);
534 ptr->key.varlen.length = key->length;
535 ptr->key.varlen.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
536 memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
537 ptr->pointer.leaf.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace + PTRALIGN(key->length);
539 ptr->pointer.leaf.length = value->length;
540 memcpy( page->data + ptr->pointer.leaf.offset, value->value, value->length );
544 packInternalKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, u_int32_t pagenumber) {
546 if ( (char*)ptr - page->data != page->npointer * TBTPOINTERSIZE(db) )
547 memmove((char*)ptr+TBTPOINTERSIZE(db), ptr, page->npointer * TBTPOINTERSIZE(db) - ( (char*)ptr - page->data ));
549 ptr = (TBTPointer*)(page->data + page->npointer * TBTPOINTERSIZE(db));
552 page->freespace -= TBTPOINTERSIZE(db);
556 memcpy( ptr->key.fixed.key, key->value, db->keylen );
558 memset( ptr->key.fixed.key, 0, db->keylen );
560 page->freespace -= PTRALIGN(key->length);
561 ptr->key.varlen.length = key->length;
562 ptr->key.varlen.offset = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
564 memcpy( page->data + ptr->key.varlen.offset, key->value, key->length);
566 ptr->pointer.internal.link = pagenumber;
571 deleteKV(TBTree *db, TBTPage *page, TBTPointer **ptrs, u_int32_t num) {
572 static char buf[TBTREEPAGESIZE];
574 TBTPointer *ptr = (TBTPointer*)page->data, *tmp, **ptrptr = ptrs;
576 /* suppose ptrs are ordered! */
578 while( ptrptr - ptrs < num && (char*)ptr - page->data < TBTPOINTERSIZE(db)*page->npointer ) {
579 if ( *ptrptr < ptr ) {
580 tlog(TL_CRIT,"deleteKV: Something wrong: unexistent *ptrs");
582 } else if ( *ptrptr > ptr ) {
584 memcpy(tmp, ptr, TBTPOINTERSIZE(db));
585 ptr = (TBTPointer*)( (char*)ptr + TBTPOINTERSIZE(db) );
586 tmp = (TBTPointer*)( (char*)tmp + TBTPOINTERSIZE(db) );
588 page->freespace += TBTPOINTERSIZE(db) +
589 ( ( page->isleaf ) ? PTRALIGN(ptr->pointer.leaf.length) : 0 ) +
590 ( ( db->keylen ) ? 0 : PTRALIGN(ptr->key.varlen.length) );
592 ptr = (TBTPointer*)( (char*)ptr + TBTPOINTERSIZE(db) );
596 if ( (char*)ptr - page->data < TBTPOINTERSIZE(db)*page->npointer && tmp!=ptr )
597 memmove( tmp, ptr, page->npointer * TBTPOINTERSIZE(db) - ((char*)ptr - page->data) );
601 tmp = (TBTPointer*)page->data;
602 start = page->npointer * TBTPOINTERSIZE(db) + page->freespace;
603 for(i=0;i<page->npointer;i++) {
604 if ( page->isleaf ) {
605 memcpy( bufptr, page->data + tmp->pointer.leaf.offset, tmp->pointer.leaf.length );
606 tmp->pointer.leaf.offset = start + (bufptr-buf);
607 bufptr+=PTRALIGN(tmp->pointer.leaf.length);
610 if ( db->keylen == 0 ) {
611 if ( tmp->key.varlen.length )
612 memcpy( bufptr, page->data + tmp->key.varlen.offset, tmp->key.varlen.length );
613 tmp->key.varlen.offset = start + (bufptr-buf);
614 bufptr+=PTRALIGN(tmp->key.varlen.length);
616 tmp = (TBTPointer*)( (char*)tmp + TBTPOINTERSIZE(db) );
619 memcpy( page->data + start, buf, bufptr-buf );
623 findDelimiter(TBTree *db, TBTPage *page, TBTPointer *ptr, int size, u_int32_t start, int *where) {
624 int *sizes = (int*) tmalloc( sizeof(int) * (page->npointer+1) );
627 int lfree=TBTREEPAGESIZE-TBTPAGEHDRSZ, rfree=TBTREEPAGESIZE-TBTPAGEHDRSZ;
628 int nptr = ( (char*)ptr - page->data ) / TBTPOINTERSIZE(db);
631 for(i=0; i<page->npointer;i++) {
632 tomove = (TBTPointer*)(page->data + i*TBTPOINTERSIZE(db));
633 if ( was==0 && i==nptr ) {
637 sizes[i+was] = TBTPOINTERSIZE(db) +
638 ( (page->isleaf) ? PTRALIGN(tomove->pointer.leaf.length) : 0 ) +
639 ( ( db->keylen ) ? 0 : PTRALIGN(tomove->key.varlen.length) );
644 sizes[page->npointer]=size;
646 for(i=0;i<page->npointer+1;i++) {
658 } else if ( rfree < 0 ) {
666 *where = (nptr < start) ? -1 : 1;
675 populatePages(TBTree *db, TBTPage *srcpage, TBTPage* newpage, u_int32_t start) {
677 TBTPointer *tomove, **todelete = tmalloc( sizeof(TBTPointer*)* srcpage->npointer );
678 u_int32_t numtodelete=0;
681 for(i=start;i<srcpage->npointer;i++) {
682 tomove = (TBTPointer*)(srcpage->data + i*TBTPOINTERSIZE(db));
684 todelete[numtodelete] = tomove;
688 k.value = tomove->key.fixed.key;
690 k.length = tomove->key.varlen.length;
691 k.value = srcpage->data + tomove->key.varlen.offset;
694 if ( srcpage->isleaf ) {
695 v.length = tomove->pointer.leaf.length;
696 v.value = srcpage->data + tomove->pointer.leaf.offset;
697 packLeafKV(db, newpage, NULL, &k, &v);
699 packInternalKV(db, newpage, NULL, &k, tomove->pointer.internal.link);
702 deleteKV(db, srcpage, todelete, numtodelete);
707 splitPage(TBTree *db, TBTMemPage *srcpage, TBTMemPage** newpage, TBTPointer **ptr, int size, TBTValue *key, TBTValue *value, u_int32_t pagenumber) {
709 int start=0, where=0;
710 int nptr = ( (char*)(*ptr) - srcpage->page.data ) / TBTPOINTERSIZE(db);
712 if ( TBTNewPage(db, newpage) )
716 srcpage->issynced=tmp->issynced=0;
718 tmp->page.rightlink = srcpage->page.rightlink;
719 srcpage->page.rightlink = tmp->pagenumber;
720 tmp->page.isleaf = srcpage->page.isleaf;
722 switch(db->strategy) {
723 case TBT_STATEGY_HALF:
724 start = findDelimiter(db, &(srcpage->page), *ptr, size, (srcpage->page.npointer+1)/2, &where);
726 case TBT_STATEGY_LEFTFILL:
727 start = findDelimiter(db, &(srcpage->page), *ptr, size, srcpage->page.npointer, &where);
729 case TBT_STATEGY_RIGHTFILL:
730 start = findDelimiter(db, &(srcpage->page), *ptr, size, 0, &where);
733 tlog(TL_CRIT,"splitPage: unknown strategy: %d", db->strategy);
737 populatePages(db, &(srcpage->page), &(tmp->page), start);
742 *ptr = (TBTPointer*)(tmp->page.data + nptr*TBTPOINTERSIZE(db));
744 if ( tmp->page.isleaf )
745 packLeafKV(db, &(tmp->page), *ptr, key, value);
747 packInternalKV(db, &(tmp->page), *ptr, key, pagenumber);
753 savePage(TBTree *db, TBTMemPage *page) {
757 if ( !page->iscached ) {
758 if ( page->issynced==0 && (rc=TBTWritePage(db, page)) != TBT_OK )
766 layerInsert(TBTree *db, u_int32_t pagenumber, TBTMemPage **left, TBTMemPage **right, TBTValue *key, TBTValue *value) {
768 TBTPointer *ptr=NULL;
769 TBTMemPage *page=NULL, *newright=NULL;
771 if ( (rc=TBTReadPage(db, pagenumber, &page))!=TBT_OK )
776 fnd = findInPage(db, &(page->page), key, &ptr );
779 if ( page->page.isleaf ) {
780 u_int32_t size = TBTPOINTERSIZE(db) +
781 PTRALIGN(value->length) +
782 ( ( db->keylen ) ? 0 : PTRALIGN(key->length) );
784 if ( size>(TBTREEPAGESIZE-TBTPAGEHDRSZ)/2 ) {
785 tlog(TL_ALARM,"Huge size: %d bytes", size);
790 deleteKV(db, &(page->page), &ptr, 1);
794 if ( size <= page->page.freespace ) {
795 packLeafKV(db, &(page->page), ptr, key, value);
798 rc = splitPage(db, page, &newright, &ptr, size, key, value, 0);
799 if ( rc != TBT_OK ) return rc;
801 page->issynced = newright->issynced = 0;
802 page->islocked = newright->islocked = 1;
805 if ( (rc=layerInsert(db, ptr->pointer.internal.link, left, right, key, value))!=TBT_OK ) {
810 if ( *left ) { /* child page was splited */
812 TBTPointer *rightptr = (TBTPointer*)( (*left)->page.data + ((*left)->page.npointer-1) * TBTPOINTERSIZE(db) );
815 if ( ISINFPOINTER(db, &(page->page), ptr) ) {
816 deleteKV(db, &(page->page), &ptr, 1);
817 K.length = 0; K.value = NULL;
818 packInternalKV(db, &(page->page), ptr, &K, (*right)->pagenumber);
820 ptr->pointer.internal.link=(*right)->pagenumber;
822 K.length = ( db->keylen ) ? db->keylen : rightptr->key.varlen.length;
823 K.value = ( db->keylen ) ? rightptr->key.fixed.key : (*left)->page.data + rightptr->key.varlen.offset;
825 size = TBTPOINTERSIZE(db) + ( ( db->keylen ) ? 0 : PTRALIGN(K.length) );
827 if ( size <= page->page.freespace ) {
828 packInternalKV(db, &(page->page), ptr, &K, (*left)->pagenumber);
831 rc = splitPage(db, page, &newright, &ptr, size, &K, NULL, (*left)->pagenumber);
832 if ( rc != TBT_OK ) return rc;
834 page->issynced = newright->issynced = 0;
835 page->islocked = newright->islocked = 1;
841 if ( (rc=savePage(db, *left))!=TBT_OK )
843 if ( (rc=savePage(db, *right))!=TBT_OK )
849 if ( page->pagenumber==TBTPAGEROOT ) { /* root was splited */
854 if ( (rc=TBTNewPage(db,&newleft))!=TBT_OK )
856 memcpy( &(newleft->page) , &(page->page), TBTREEPAGESIZE);
857 memset( &(page->page), 0, TBTREEPAGESIZE);
858 page->page.freespace=TBTREEPAGESIZE-TBTPAGEHDRSZ;
860 ptr = (TBTPointer*)( newleft->page.data + (newleft->page.npointer-1) * TBTPOINTERSIZE(db) );
861 K.length = ( db->keylen ) ? db->keylen : ptr->key.varlen.length;
862 K.value = ( db->keylen ) ? ptr->key.fixed.key : newleft->page.data + ptr->key.varlen.offset;
863 packInternalKV(db, &(page->page), NULL, &K, newleft->pagenumber);
865 ptr = (TBTPointer*)( newright->page.data + (newright->page.npointer-1) * TBTPOINTERSIZE(db) );
868 packInternalKV(db, &(page->page), NULL, &K, newright->pagenumber);
870 if ( (rc=savePage(db, newright))!=TBT_OK )
872 if ( (rc=savePage(db, newleft))!=TBT_OK )
874 if ( (rc=savePage(db, page))!=TBT_OK )
880 } else if ( (rc=savePage(db, page))!=TBT_OK )
888 TBTInsert(TBTree *db, TBTValue *key, TBTValue *value) {
890 TBTMemPage *lpage=NULL, *rpage=NULL;
892 rc = layerInsert(db, TBTPAGEROOT, &lpage, &rpage, key, value);
894 tassert( lpage==NULL && rpage==NULL );
900 TBTDelete(TBTree *db, TBTValue *key) {
905 if ( (rc=findPage(db, key, &page))!=TBT_OK )
911 if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) {
912 if ( !(page)->iscached )
917 deleteKV(db, &(page->page), &ptr, 1);
920 rc=savePage(db, page);
926 TBTInitIterator(TBTree *db, TBTIterator *iterator) {
927 TBTMemPage *page=NULL;
930 u_int32_t pagenum=TBTPAGEROOT;
932 memset(iterator, 0, sizeof(TBTIterator));
935 if ( page && !page->iscached )
937 if ( (rc=TBTReadPage(db, pagenum, &page))!=TBT_OK )
940 ptr = (TBTPointer*)( page->page.data );
941 pagenum = ptr->pointer.internal.link;
942 } while( !page->page.isleaf );
945 iterator->page = page;
952 TBTFreeIterator(TBTree *db, TBTIterator *iterator) {
954 if ( iterator->page ) {
955 iterator->page->islocked=0;
956 if ( !iterator->page->iscached )
957 tfree(iterator->page);
960 memset(iterator, 0, sizeof(TBTIterator));
964 TBTIterate(TBTree *db, TBTIterator *iterator, TBTValue *key, TBTValue *value ) {
967 if ( iterator->page==NULL ) {
968 memset(key, 0, sizeof(TBTValue));
969 memset(value, 0, sizeof(TBTValue));
973 if ( (char*)(iterator->ptr) - iterator->page->page.data >= TBTPOINTERSIZE(db) * iterator->page->page.npointer ) {
975 u_int32_t pagenumber = iterator->page->page.rightlink;
976 iterator->page->islocked=0;
977 if ( !iterator->page->iscached )
978 tfree(iterator->page);
981 if ( pagenumber==TBTPAGEROOT ) {
982 memset(key, 0, sizeof(TBTValue));
983 memset(value, 0, sizeof(TBTValue));
986 if ( (rc=TBTReadPage(db, pagenumber, &(iterator->page)))!=TBT_OK )
988 } while ( iterator->page->page.npointer == 0 );
990 iterator->page->islocked=1;
991 iterator->ptr = (TBTPointer*)( iterator->page->page.data );
995 key->length = db->keylen;
996 key->value = iterator->ptr->key.fixed.key;
998 key->length = iterator->ptr->key.varlen.length;
999 key->value = iterator->page->page.data + iterator->ptr->key.varlen.offset;
1002 value->length = iterator->ptr->pointer.leaf.length;
1003 value->value = iterator->page->page.data + iterator->ptr->pointer.leaf.offset;
1005 iterator->ptr = ( TBTPointer* ) ( (char*)(iterator->ptr) + TBTPOINTERSIZE(db) );