/* * Copyright (c) 2005 Teodor Sigaev * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY CONTRIBUTORS ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include "tmalloc.h" #include "tlog.h" #include "tbtree.h" #define NOTFOUND 0 #define FOUND 1 static int TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) ; static int TBTWritePage(TBTree *db, TBTMemPage *page) ; static int TBTNewPage(TBTree *db, TBTMemPage **page) ; int TBTOpen(TBTree *db, char *file) { off_t offset; if ( db->readonly ) { db->fd = open(file, O_RDONLY); if ( db->fd < 0 ) { tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno)); return TBT_ERROR; } if ( flock( db->fd, LOCK_SH ) < 0 ) { tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno)); close(db->fd); return TBT_ERROR; } } else { db->fd = open(file, O_CREAT | O_RDWR, 0666); if ( db->fd < 0 ) { tlog(TL_CRIT,"TBTOpen: open failed: %s",strerror(errno)); return TBT_ERROR; } if ( flock( db->fd, LOCK_EX ) < 0 ) { tlog(TL_CRIT,"TBTOpen: flock failed: %s",strerror(errno)); close(db->fd); return TBT_ERROR; } } if ( db->npage ) { db->Cache=(TBTMemPage**)t0malloc( sizeof(TBTMemPage*)*HASHSIZE(db->npage) ); db->curpage=0; } db->cachehits = db->pageread = db->pagewrite = 0; if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) { tlog(TL_CRIT,"TBTOpen: lseek failed: %s",strerror(errno)); flock( db->fd, LOCK_UN ); close(db->fd); return TBT_ERROR; } if ( offset==0 ) { if ( db->readonly ) { tlog(TL_CRIT,"TBTOpen: file is void"); flock( db->fd, LOCK_UN ); close(db->fd); return TBT_ERROR; } else { TBTMemPage *page; if ( TBTNewPage(db, &page) != TBT_OK ) { flock( db->fd, LOCK_UN ); close(db->fd); return TBT_ERROR; } page->page.isleaf=1; if ( TBTWritePage(db, page)!=TBT_OK ){ flock( db->fd, LOCK_UN ); close(db->fd); return TBT_ERROR; } } } db->pointersize = TBTPOINTERSIZE(db); db->lastpagenumber=0; return TBT_OK; } int TBTClose(TBTree *db) { int rc=0; if ( !db->readonly ) rc=TBTSync(db); if ( db->npage ) { u_int32_t i; TBTMemPage *ptr,*tmp; for(i=0; inpage); i++) { ptr = db->Cache[i]; while(ptr) { tmp = ptr->link; tfree(ptr); ptr=tmp; } } tfree( db->Cache ); } flock( db->fd, LOCK_UN ); close(db->fd); return (rc) ? TBT_ERROR : TBT_OK; } static int cmpNPage(const void *a, const void *b) { tassert( (*(TBTMemPage**)a)->pagenumber != (*(TBTMemPage**)b)->pagenumber ); return ( (*(TBTMemPage**)a)->pagenumber > (*(TBTMemPage**)b)->pagenumber ) ? 1 : -1; } int TBTSync(TBTree *db) { int rc=0; if ( db->readonly || db->npage==0 ) return TBT_OK; if ( db->npage ) { u_int32_t i, total=0; TBTMemPage *ptr, **data = (TBTMemPage**)tmalloc(db->npage*sizeof(TBTMemPage*)), **pptr; pptr=data; for(i=0; inpage); i++) { ptr = db->Cache[i]; while(ptr) { if ( !ptr->issynced ) { *pptr = ptr; pptr++; } ptr = ptr->link; } } total=pptr-data; if ( total > 0 ) { if ( total>1 ) qsort( data, total, sizeof(TBTMemPage*), cmpNPage); pptr=data; while( pptr-datafd) ) { tlog(TL_CRIT,"TBTSync: fsync failed: %s",strerror(errno)); rc=TBT_ERROR; } return (rc) ? TBT_ERROR : TBT_OK; } static TBTMemPage * findInCache(TBTree *db, u_int32_t pagenumber) { TBTMemPage *ptr; ptr = db->Cache[ pagenumber % HASHSIZE(db->npage) ]; while(ptr && ptr->pagenumber != pagenumber) ptr = ptr->link; return ptr; } static void removeFromCache(TBTree *db, u_int32_t pagenumber) { TBTMemPage *ptr, *prev=NULL; int pos = pagenumber % HASHSIZE(db->npage); ptr = db->Cache[ pos ]; while(ptr) { if ( ptr->pagenumber == pagenumber ) { if ( prev ) prev->link = ptr->link; else db->Cache[ pos ] = ptr->link; break; } prev=ptr; ptr = ptr->link; } } static void insertInCache(TBTree *db, TBTMemPage *page) { int pos = page->pagenumber % HASHSIZE(db->npage); page->link = db->Cache[ pos ]; db->Cache[ pos ] = page; } static TBTMemPage* PageAlloc(TBTree *db) { TBTMemPage *page; if ( db->curpage < db->npage ) { page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage)); memset( page, 0, TBTMEMPAGEHDRSZ ); page->iscached=1; if ( db->curpage == 0 ) { db->TimeCache = db->TimeCacheLast = page; } else { db->TimeCacheLast->next = page; page->prev = db->TimeCacheLast; db->TimeCacheLast = page; } db->curpage++; } else if ( db->npage == 0 ) { page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage)); memset( page, 0, TBTMEMPAGEHDRSZ ); } else { page = db->TimeCache; while( page->next && page->islocked ) page = page->next; if ( page==db->TimeCacheLast && page->islocked ) { /*all pages locked*/ /* tlog(TL_WARN,"TBTReadPage: all pages in cache locked, increase number of cached pages"); */ page = (TBTMemPage*)tmalloc(sizeof(TBTMemPage)); memset( page, 0, TBTMEMPAGEHDRSZ ); } else { if ( !page->issynced ) if ( TBTWritePage(db, page) ) return NULL; removeFromCache(db, page->pagenumber); if ( page != db->TimeCacheLast ) { if ( page == db->TimeCache ) { db->TimeCache = page->next; db->TimeCache->prev = NULL; } else { page->prev->next = page->next; page->next->prev = page->prev; } memset( page, 0, TBTMEMPAGEHDRSZ ); db->TimeCacheLast->next = page; page->prev = db->TimeCacheLast; db->TimeCacheLast = page; } else memset( page, 0, TBTMEMPAGEHDRSZ ); page->iscached=1; } } return page; } static int TBTReadPage(TBTree *db, u_int32_t pagenumber, TBTMemPage **page) { int rc; if ( db->npage && (*page=findInCache(db, pagenumber))!=NULL ) { db->cachehits++; if ( *page != db->TimeCacheLast ) { if ( (*page) == db->TimeCache ) { db->TimeCache = (*page)->next; db->TimeCache->prev=NULL; } else { (*page)->prev->next = (*page)->next; (*page)->next->prev = (*page)->prev; } db->TimeCacheLast->next = (*page); (*page)->prev = db->TimeCacheLast; db->TimeCacheLast = *page; (*page)->next = NULL; } } else { off_t offset = (off_t)pagenumber * (off_t)TBTREEPAGESIZE; if( (*page = PageAlloc(db))==NULL ) return TBT_ERROR; if ( lseek(db->fd, offset, SEEK_SET) != offset ) { tlog(TL_CRIT,"TBTReadPage: lseek failed: %s",strerror(errno)); return TBT_ERROR; } (*page)->issynced=1; (*page)->pagenumber=pagenumber; if ( (rc=read(db->fd, &( (*page)->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) { tlog(TL_CRIT,"TBTReadPage: read failed: %s %d %d",strerror(errno), pagenumber, offset); return TBT_ERROR; } if ( (*page)->iscached ) insertInCache(db, *page); } db->pageread++; return TBT_OK; } static int TBTWritePage(TBTree *db, TBTMemPage *page) { off_t offset = (off_t)(page->pagenumber) * (off_t)TBTREEPAGESIZE; off_t size; if ( page->issynced ) return TBT_OK; if ( lseek(db->fd, offset, SEEK_SET) != offset ) { tlog(TL_CRIT,"TBTWritePage: lseek failed: %s",strerror(errno)); return TBT_ERROR; } if ( (size=write(db->fd, &( page->page ), TBTREEPAGESIZE)) != TBTREEPAGESIZE ) { tlog(TL_CRIT,"TBTWritePage: write failed: %s (writed only %d bytes)",strerror(errno), size); return TBT_ERROR; } page->issynced=1; db->pagewrite++; return TBT_OK; } static int TBTNewPage(TBTree *db, TBTMemPage **page) { if ( db->lastpagenumber==0 ) { off_t offset; if ( (offset=lseek(db->fd, 0, SEEK_END)) < 0 ) { tlog(TL_CRIT,"TBTNewPage: lseek failed: %s",strerror(errno)); return TBT_ERROR; } if ( offset % TBTREEPAGESIZE != 0 ) { tlog(TL_CRIT,"TBTNewPage: file corrupted"); return TBT_ERROR; } db->lastpagenumber = offset/TBTREEPAGESIZE; } if( (*page = PageAlloc(db))==NULL ) return TBT_ERROR; (*page)->pagenumber = db->lastpagenumber; (*page)->issynced=0; (*page)->islocked=1; db->lastpagenumber++; memset( &((*page)->page), 0, TBTMEMPAGEHDRSZ); (*page)->page.freespace = TBTREEPAGESIZE-TBTPAGEHDRSZ; if ( (*page)->iscached ) insertInCache(db, *page); return TBT_OK; } static void printValue(u_int32_t len, char *ptr) { putchar('\''); while(len>0) { putchar(*ptr); ptr++; len--; } putchar('\''); } static void dumpPage(TBTree *db, TBTPage *page, int follow) { u_int32_t i,j; for(j=0;jfreespace, page->isleaf, page->rightlink, page->npointer ); for(i=0;inpointer;i++) { TBTPointer *ptr= (TBTPointer*)(page->data+db->pointersize*i); for(j=0;jkeylen) ? db->keylen : ptr->key.varlen.length, (db->keylen) ? 0 : ptr->key.varlen.offset ); if ( db->keylen==0 ) printValue(ptr->key.varlen.length, page->data + ptr->key.varlen.offset); else printf("'%d'", *(int*)(ptr->key.fixed.key)); if ( page->isleaf ) { printf(" vl:%d vo:%d ", ptr->pointer.leaf.length, ptr->pointer.leaf.offset ); printValue(ptr->pointer.leaf.length, page->data + ptr->pointer.leaf.offset ); putchar('\n'); } else { printf(" page:%d\n", ptr->pointer.internal.link); if ( follow>=0 ) dumpTree(db, ptr->pointer.internal.link, follow+4); } } } void dumpTree(TBTree *db, u_int32_t pagenumber, int follow) { TBTMemPage *page; if ( TBTReadPage(db, pagenumber, &page) ) { tlog(TL_CRIT,"BEDA %d\n", pagenumber); return; } page->islocked=1; dumpPage(db, &(page->page), follow); page->islocked=0; if ( !page->iscached ) tfree(page); } static int findInPage(TBTree *db, TBTPage *page, TBTValue *key, TBTPointer **res) { TBTPointer *StopLow = (TBTPointer*)(page->data); TBTPointer *StopHigh = (TBTPointer*)(page->data + db->pointersize*page->npointer); TBTPointer *StopMiddle; int diff ; TBTValue A; A.length = db->keylen; /* Loop invariant: StopLow <= StopMiddle < StopHigh */ while (StopLow < StopHigh) { StopMiddle = (TBTPointer*)(((char*)StopLow) + db->pointersize*((((char*)StopHigh - (char*)StopLow)/db->pointersize) >> 1)); if ( db->keylen == 0 ) { A.length = StopMiddle->key.varlen.length; A.value = page->data + StopMiddle->key.varlen.offset; } else A.value = StopMiddle->key.fixed.key; if ( ISINFPOINTER(db, page, StopMiddle) ) diff = 1; else diff = db->cmpkey( &A, key ); if ( diff == 0 ) { *res = StopMiddle; return FOUND; } else if ( diff < 0 ) StopLow = (TBTPointer*)((char*)StopMiddle + db->pointersize); else StopHigh = StopMiddle; } *res = StopHigh; return NOTFOUND; } static int findPage(TBTree *db, TBTValue *key, TBTMemPage **page) { u_int32_t pagenumber = TBTPAGEROOT; int rc = TBT_OK; *page=NULL; while(1) { TBTPointer *ptr; if ( (rc=TBTReadPage(db, pagenumber, page))!=TBT_OK ) return rc; if ( (*page)->page.isleaf ) return TBT_OK; findInPage(db, &((*page)->page), key, &ptr ); pagenumber = ptr->pointer.internal.link; if ( !(*page)->iscached ) tfree(*page); } return TBT_OK; } int TBTFind(TBTree *db, TBTValue *key, TBTValue *value) { TBTMemPage *page; TBTPointer *ptr; int rc; memset(value, 0, sizeof(TBTValue)); if ( (rc=findPage(db, key, &page))!=TBT_OK ) return rc; if ( page == NULL ) return TBT_OK; if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) { if ( !(page)->iscached ) tfree(page); return TBT_OK; } value->length = ptr->pointer.leaf.length; value->value = tmalloc(value->length); memcpy( value->value, page->page.data + ptr->pointer.leaf.offset, value->length ); if ( !(page)->iscached ) tfree(page); return TBT_OK; } static void packLeafKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, TBTValue *value) { if ( ptr ) { if ( (char*)ptr - page->data != page->npointer * db->pointersize ) memmove((char*)ptr+db->pointersize, ptr, page->npointer * db->pointersize - ( (char*)ptr - page->data )); } else ptr = (TBTPointer*)(page->data + page->npointer * db->pointersize); page->npointer++; page->freespace -= db->pointersize + PTRALIGN(value->length); if ( db->keylen ) { memcpy( ptr->key.fixed.key, key->value, db->keylen ); ptr->pointer.leaf.offset = page->npointer * db->pointersize + page->freespace; } else { page->freespace -= PTRALIGN(key->length); ptr->key.varlen.length = key->length; ptr->key.varlen.offset = page->npointer * db->pointersize + page->freespace; memcpy( page->data + ptr->key.varlen.offset, key->value, key->length); ptr->pointer.leaf.offset = page->npointer * db->pointersize + page->freespace + PTRALIGN(key->length); } ptr->pointer.leaf.length = value->length; memcpy( page->data + ptr->pointer.leaf.offset, value->value, value->length ); } static void packInternalKV(TBTree *db, TBTPage *page, TBTPointer *ptr, TBTValue *key, u_int32_t pagenumber) { if ( ptr ) { if ( (char*)ptr - page->data != page->npointer * db->pointersize ) memmove((char*)ptr+db->pointersize, ptr, page->npointer * db->pointersize - ( (char*)ptr - page->data )); } else ptr = (TBTPointer*)(page->data + page->npointer * db->pointersize); page->npointer++; page->freespace -= db->pointersize; if ( db->keylen ) { if ( key->value ) memcpy( ptr->key.fixed.key, key->value, db->keylen ); } else { page->freespace -= PTRALIGN(key->length); ptr->key.varlen.length = key->length; ptr->key.varlen.offset = page->npointer * db->pointersize + page->freespace; if ( key->length ) memcpy( page->data + ptr->key.varlen.offset, key->value, key->length); } ptr->pointer.internal.link = pagenumber; } static void deleteKV(TBTree *db, TBTPage *page, TBTPointer **ptrs, u_int32_t num) { TBTPointer *ptr = (TBTPointer*)page->data, *tmp, **ptrptr = ptrs; u_int32_t i; /* suppose ptrs are ordered! */ tmp = ptr; while( ptrptr - ptrs < num && (char*)ptr - page->data < db->pointersize*page->npointer ) { if ( *ptrptr < ptr ) { tlog(TL_CRIT,"deleteKV: Something wrong: unexistent *ptrs"); ptrptr++; } else if ( *ptrptr > ptr ) { if ( tmp != ptr ) memcpy(tmp, ptr, db->pointersize); ptr = (TBTPointer*)( (char*)ptr + db->pointersize ); tmp = (TBTPointer*)( (char*)tmp + db->pointersize ); } else { page->freespace += db->pointersize + ( ( page->isleaf ) ? PTRALIGN(ptr->pointer.leaf.length) : 0 ) + ( ( db->keylen ) ? 0 : PTRALIGN(ptr->key.varlen.length) ); ptrptr++; ptr = (TBTPointer*)( (char*)ptr + db->pointersize ); } } if ( (char*)ptr - page->data < db->pointersize*page->npointer && tmp!=ptr ) memmove( tmp, ptr, page->npointer * db->pointersize - ((char*)ptr - page->data) ); page->npointer-=num; if ( page->isleaf || db->keylen == 0 ) { static char buf[TBTREEPAGESIZE]; char *bufptr = buf; u_int32_t start = page->npointer * db->pointersize + page->freespace; tmp = (TBTPointer*)page->data; for(i=0;inpointer;i++) { if ( page->isleaf ) { memcpy( bufptr, page->data + tmp->pointer.leaf.offset, tmp->pointer.leaf.length ); tmp->pointer.leaf.offset = start + (bufptr-buf); bufptr+=PTRALIGN(tmp->pointer.leaf.length); } if ( db->keylen == 0 ) { if ( tmp->key.varlen.length ) memcpy( bufptr, page->data + tmp->key.varlen.offset, tmp->key.varlen.length ); tmp->key.varlen.offset = start + (bufptr-buf); bufptr+=PTRALIGN(tmp->key.varlen.length); } tmp = (TBTPointer*)( (char*)tmp + db->pointersize ); } memcpy( page->data + start, buf, bufptr-buf ); } } static u_int32_t findDelimiter(TBTree *db, TBTPage *page, TBTPointer *ptr, int size, u_int32_t start, int *where) { int *sizes = (int*) tmalloc( sizeof(int) * (page->npointer+1) ); int i; TBTPointer *tomove; int lfree=TBTREEPAGESIZE-TBTPAGEHDRSZ, rfree=TBTREEPAGESIZE-TBTPAGEHDRSZ; int nptr = ( (char*)ptr - page->data ) / db->pointersize; int was=0; for(i=0; inpointer;i++) { tomove = (TBTPointer*)(page->data + i*db->pointersize); if ( was==0 && i==nptr ) { sizes[i] = size; was=1; } sizes[i+was] = db->pointersize + ( (page->isleaf) ? PTRALIGN(tomove->pointer.leaf.length) : 0 ) + ( ( db->keylen ) ? 0 : PTRALIGN(tomove->key.varlen.length) ); } if ( was==0 ) { sizes[page->npointer]=size; } for(i=0;inpointer+1;i++) if ( i< start ) lfree-=sizes[i]; else rfree-=sizes[i]; while( 1 ) { if ( lfree<0 ) { start--; lfree+=sizes[start]; rfree-=sizes[start]; } else if ( rfree < 0 ) { lfree-=sizes[start]; rfree+=sizes[start]; start++; } else break; } *where = (nptr < start) ? -1 : 1; if ( start>nptr ) start--; tfree(sizes); return start; } static void populatePages(TBTree *db, TBTPage *srcpage, TBTPage* newpage, u_int32_t start) { u_int32_t i; TBTPointer *tomove, **todelete = tmalloc( sizeof(TBTPointer*)* srcpage->npointer ); u_int32_t numtodelete=0; TBTValue k, v; for(i=start;inpointer;i++) { tomove = (TBTPointer*)(srcpage->data + i*db->pointersize); todelete[numtodelete] = tomove; numtodelete++; if ( db->keylen ) k.value = tomove->key.fixed.key; else { k.length = tomove->key.varlen.length; k.value = srcpage->data + tomove->key.varlen.offset; } if ( srcpage->isleaf ) { v.length = tomove->pointer.leaf.length; v.value = srcpage->data + tomove->pointer.leaf.offset; packLeafKV(db, newpage, NULL, &k, &v); } else packInternalKV(db, newpage, NULL, &k, tomove->pointer.internal.link); } deleteKV(db, srcpage, todelete, numtodelete); tfree(todelete); } static int splitPage(TBTree *db, TBTMemPage *srcpage, TBTMemPage** newpage, TBTPointer **ptr, int size, TBTValue *key, TBTValue *value, u_int32_t pagenumber) { TBTMemPage *tmp; int start=0, where=0; int nptr = ( (char*)(*ptr) - srcpage->page.data ) / db->pointersize; if ( TBTNewPage(db, newpage) ) return TBT_ERROR; tmp = *newpage; srcpage->issynced=tmp->issynced=0; tmp->islocked=1; tmp->page.rightlink = srcpage->page.rightlink; srcpage->page.rightlink = tmp->pagenumber; tmp->page.isleaf = srcpage->page.isleaf; switch(db->strategy) { case TBT_STATEGY_HALF: start = findDelimiter(db, &(srcpage->page), *ptr, size, (srcpage->page.npointer+1)/2, &where); break; case TBT_STATEGY_LEFTFILL: start = findDelimiter(db, &(srcpage->page), *ptr, size, srcpage->page.npointer, &where); break; case TBT_STATEGY_RIGHTFILL: start = findDelimiter(db, &(srcpage->page), *ptr, size, 0, &where); break; default: tlog(TL_CRIT,"splitPage: unknown strategy: %d", db->strategy); return TBT_ERROR; } populatePages(db, &(srcpage->page), &(tmp->page), start); if ( where<0 ) tmp=srcpage; else nptr-=start; *ptr = (TBTPointer*)(tmp->page.data + nptr*db->pointersize); if ( tmp->page.isleaf ) packLeafKV(db, &(tmp->page), *ptr, key, value); else packInternalKV(db, &(tmp->page), *ptr, key, pagenumber); return TBT_OK; } static int savePage(TBTree *db, TBTMemPage *page) { int rc; page->islocked=0; if ( !page->iscached ) { if ( page->issynced==0 && (rc=TBTWritePage(db, page)) != TBT_OK ) return rc; tfree(page); } return TBT_OK; } static int layerInsert(TBTree *db, u_int32_t pagenumber, TBTMemPage **left, TBTMemPage **right, TBTValue *key, TBTValue *value) { int rc,fnd; TBTPointer *ptr=NULL; TBTMemPage *page=NULL, *newright=NULL; if ( (rc=TBTReadPage(db, pagenumber, &page))!=TBT_OK ) return rc; page->islocked=1; fnd = findInPage(db, &(page->page), key, &ptr ); *left=*right=NULL; if ( page->page.isleaf ) { u_int32_t size = db->pointersize + PTRALIGN(value->length) + ( ( db->keylen ) ? 0 : PTRALIGN(key->length) ); if ( size>(TBTREEPAGESIZE-TBTPAGEHDRSZ)/2 ) { tlog(TL_ALARM,"Huge size: %d bytes", size); return TBT_HUGE; } if (fnd==FOUND) { deleteKV(db, &(page->page), &ptr, 1); page->issynced=0; } if ( size <= page->page.freespace ) { u_int32_t oldsize=page->page.freespace; packLeafKV(db, &(page->page), ptr, key, value); tassert( oldsize == page->page.freespace + size ); page->issynced=0; } else { rc = splitPage(db, page, &newright, &ptr, size, key, value, 0); if ( rc != TBT_OK ) return rc; page->issynced = newright->issynced = 0; page->islocked = newright->islocked = 1; } } else { if ( (rc=layerInsert(db, ptr->pointer.internal.link, left, right, key, value))!=TBT_OK ) { page->islocked=0; return rc; } if ( *left ) { /* child page was splited */ u_int32_t size; TBTPointer *rightptr = (TBTPointer*)( (*left)->page.data + ((*left)->page.npointer-1) * db->pointersize ); TBTValue K; if ( ISINFPOINTER(db, &(page->page), ptr) ) { deleteKV(db, &(page->page), &ptr, 1); K.length = 0; K.value = NULL; packInternalKV(db, &(page->page), ptr, &K, (*right)->pagenumber); } else ptr->pointer.internal.link=(*right)->pagenumber; K.length = ( db->keylen ) ? db->keylen : rightptr->key.varlen.length; K.value = ( db->keylen ) ? rightptr->key.fixed.key : (*left)->page.data + rightptr->key.varlen.offset; size = db->pointersize + ( ( db->keylen ) ? 0 : PTRALIGN(K.length) ); if ( size <= page->page.freespace ) { packInternalKV(db, &(page->page), ptr, &K, (*left)->pagenumber); page->issynced=0; } else { rc = splitPage(db, page, &newright, &ptr, size, &K, NULL, (*left)->pagenumber); if ( rc != TBT_OK ) return rc; page->issynced = newright->issynced = 0; page->islocked = newright->islocked = 1; } } } if ( *left ) { if ( (rc=savePage(db, *left))!=TBT_OK ) return rc; if ( (rc=savePage(db, *right))!=TBT_OK ) return rc; *left=*right=NULL; } if ( newright ) { if ( page->pagenumber==TBTPAGEROOT ) { /* root was splited */ TBTMemPage *newleft; TBTValue K; if ( (rc=TBTNewPage(db,&newleft))!=TBT_OK ) return rc; memcpy( &(newleft->page) , &(page->page), TBTREEPAGESIZE); memset( &(page->page), 0, TBTREEPAGESIZE); page->page.freespace=TBTREEPAGESIZE-TBTPAGEHDRSZ; ptr = (TBTPointer*)( newleft->page.data + (newleft->page.npointer-1) * db->pointersize ); K.length = ( db->keylen ) ? db->keylen : ptr->key.varlen.length; K.value = ( db->keylen ) ? ptr->key.fixed.key : newleft->page.data + ptr->key.varlen.offset; packInternalKV(db, &(page->page), NULL, &K, newleft->pagenumber); ptr = (TBTPointer*)( newright->page.data + (newright->page.npointer-1) * db->pointersize ); K.length = 0; K.value = NULL; packInternalKV(db, &(page->page), NULL, &K, newright->pagenumber); if ( (rc=savePage(db, newright))!=TBT_OK ) return rc; if ( (rc=savePage(db, newleft))!=TBT_OK ) return rc; if ( (rc=savePage(db, page))!=TBT_OK ) return rc; } else { *left = page; *right = newright; } } else if ( (rc=savePage(db, page))!=TBT_OK ) return rc; return TBT_OK; } int TBTInsert(TBTree *db, TBTValue *key, TBTValue *value) { int rc; TBTMemPage *lpage=NULL, *rpage=NULL; rc = layerInsert(db, TBTPAGEROOT, &lpage, &rpage, key, value); tassert( lpage==NULL && rpage==NULL ); return rc; } int TBTDelete(TBTree *db, TBTValue *key) { TBTMemPage *page; TBTPointer *ptr; int rc; if ( (rc=findPage(db, key, &page))!=TBT_OK ) return rc; if ( page == NULL ) return TBT_OK; if ( findInPage(db, &(page->page), key, &ptr) != FOUND ) { if ( !(page)->iscached ) tfree(page); return TBT_OK; } deleteKV(db, &(page->page), &ptr, 1); page->issynced=0; rc=savePage(db, page); return rc; } int TBTInitIterator(TBTree *db, TBTIterator *iterator) { TBTMemPage *page=NULL; TBTPointer *ptr; int rc; u_int32_t pagenum=TBTPAGEROOT; memset(iterator, 0, sizeof(TBTIterator)); do { if ( page && !page->iscached ) tfree(page); if ( (rc=TBTReadPage(db, pagenum, &page))!=TBT_OK ) return rc; ptr = (TBTPointer*)( page->page.data ); pagenum = ptr->pointer.internal.link; } while( !page->page.isleaf ); page->islocked=1; iterator->page = page; iterator->ptr = ptr; return TBT_OK; } void TBTFreeIterator(TBTree *db, TBTIterator *iterator) { if ( iterator->page ) { iterator->page->islocked=0; if ( !iterator->page->iscached ) tfree(iterator->page); } memset(iterator, 0, sizeof(TBTIterator)); } int TBTIterate(TBTree *db, TBTIterator *iterator, TBTValue *key, TBTValue *value ) { int rc; if ( iterator->page==NULL ) { memset(key, 0, sizeof(TBTValue)); memset(value, 0, sizeof(TBTValue)); return TBT_OK; } if ( (char*)(iterator->ptr) - iterator->page->page.data >= db->pointersize * iterator->page->page.npointer ) { do { u_int32_t pagenumber = iterator->page->page.rightlink; iterator->page->islocked=0; if ( !iterator->page->iscached ) tfree(iterator->page); iterator->page=NULL; if ( pagenumber==TBTPAGEROOT ) { memset(key, 0, sizeof(TBTValue)); memset(value, 0, sizeof(TBTValue)); return TBT_OK; } if ( (rc=TBTReadPage(db, pagenumber, &(iterator->page)))!=TBT_OK ) return rc; } while ( iterator->page->page.npointer == 0 ); iterator->page->islocked=1; iterator->ptr = (TBTPointer*)( iterator->page->page.data ); } if ( db->keylen ) { key->length = db->keylen; key->value = iterator->ptr->key.fixed.key; } else { key->length = iterator->ptr->key.varlen.length; key->value = iterator->page->page.data + iterator->ptr->key.varlen.offset; } value->length = iterator->ptr->pointer.leaf.length; value->value = iterator->page->page.data + iterator->ptr->pointer.leaf.offset; iterator->ptr = ( TBTPointer* ) ( (char*)(iterator->ptr) + db->pointersize ); return TBT_OK; } int TBTGetFirst(TBTree *db, TBTValue *key, TBTValue *value) { TBTIterator iterator; int rc; if ( (rc=TBTInitIterator(db, &iterator))!=TBT_OK ) return rc; rc=TBTIterate(db, &iterator, key, value); if ( key->value ) { char * ptr; ptr = tmalloc( key->length ); memcpy( ptr, key->value, key->length ); key->value = ptr; ptr = tmalloc( value->length ); memcpy( ptr, value->value, value->length ); value->value = ptr; } TBTFreeIterator(db, &iterator); return rc; } static int findLast(TBTree *db, TBTValue *key, TBTValue *value, u_int32_t pagenumber) { TBTMemPage *page; TBTPointer *ptr; int rc=TBT_OK,i; if ( (rc=TBTReadPage(db, pagenumber, &page)) != TBT_OK ) return rc; if ( page->page.npointer==0 ) { if ( !page->iscached ) tfree(page); return TBT_OK; } page->islocked=1; if ( page->page.isleaf ) { ptr = (TBTPointer*)(page->page.data + db->pointersize * (page->page.npointer-1)); key->length = ( db->keylen ) ? db->keylen : ptr->key.varlen.length; key->value = tmalloc( key->length ); memcpy( key->value, ( db->keylen ) ? ptr->key.fixed.key : page->page.data + ptr->key.varlen.offset, key->length ); value->length = ptr->pointer.leaf.length; value->value = tmalloc( value->length ); memcpy( value->value, page->page.data + ptr->pointer.leaf.offset, value->length ); } else { for(i=page->page.npointer-1; i>=0; i--) { ptr = (TBTPointer*)( page->page.data + db->pointersize * i ); rc = findLast(db, key, value, ptr->pointer.internal.link); if ( rc!=TBT_OK || key->value ) break; } } page->islocked=0; if ( !page->iscached ) tfree(page); return rc; } int TBTGetLast(TBTree *db, TBTValue *key, TBTValue *value) { memset(key, 0, sizeof(TBTValue)); memset(value, 0, sizeof(TBTValue)); return findLast(db, key, value, TBTPAGEROOT); }