mirror of
https://github.com/tursodatabase/libsql.git
synced 2025-01-09 08:49:02 +00:00
6196 lines
180 KiB
C
6196 lines
180 KiB
C
/*
|
|
** 2011-08-14
|
|
**
|
|
** The author disclaims copyright to this source code. In place of
|
|
** a legal notice, here is a blessing:
|
|
**
|
|
** May you do good and not evil.
|
|
** May you find forgiveness for yourself and forgive others.
|
|
** May you share freely, never taking more than you give.
|
|
**
|
|
*************************************************************************
|
|
**
|
|
** PAGE FORMAT:
|
|
**
|
|
** The maximum page size is 65536 bytes.
|
|
**
|
|
** Since all records are equal to or larger than 2 bytes in size, and
|
|
** some space within the page is consumed by the page footer, there must
|
|
** be less than 2^15 records on each page.
|
|
**
|
|
** Each page ends with a footer that describes the pages contents. This
|
|
** footer serves as similar purpose to the page header in an SQLite database.
|
|
** A footer is used instead of a header because it makes it easier to
|
|
** populate a new page based on a sorted list of key/value pairs.
|
|
**
|
|
** The footer consists of the following values (starting at the end of
|
|
** the page and continuing backwards towards the start). All values are
|
|
** stored as unsigned big-endian integers.
|
|
**
|
|
** * Number of records on page (2 bytes).
|
|
** * Flags field (2 bytes).
|
|
** * Left-hand pointer value (8 bytes).
|
|
** * The starting offset of each record (2 bytes per record).
|
|
**
|
|
** Records may span pages. Unless it happens to be an exact fit, the part
|
|
** of the final record that starts on page X that does not fit on page X
|
|
** is stored at the start of page (X+1). This means there may be pages where
|
|
** (N==0). And on most pages the first record that starts on the page will
|
|
** not start at byte offset 0. For example:
|
|
**
|
|
** aaaaa bbbbb ccc <footer> cc eeeee fffff g <footer> gggg....
|
|
**
|
|
** RECORD FORMAT:
|
|
**
|
|
** The first byte of the record is a flags byte. It is a combination
|
|
** of the following flags (defined in lsmInt.h):
|
|
**
|
|
** LSM_START_DELETE
|
|
** LSM_END_DELETE
|
|
** LSM_POINT_DELETE
|
|
** LSM_INSERT
|
|
** LSM_SEPARATOR
|
|
** LSM_SYSTEMKEY
|
|
**
|
|
** Immediately following the type byte is a pointer to the smallest key
|
|
** in the next file that is larger than the key in the current record. The
|
|
** pointer is encoded as a varint. When added to the 32-bit page number
|
|
** stored in the footer, it is the page number of the page that contains the
|
|
** smallest key in the next sorted file that is larger than this key.
|
|
**
|
|
** Next is the number of bytes in the key, encoded as a varint.
|
|
**
|
|
** If the LSM_INSERT flag is set, the number of bytes in the value, as
|
|
** a varint, is next.
|
|
**
|
|
** Finally, the blob of data containing the key, and for LSM_INSERT
|
|
** records, the value as well.
|
|
*/
|
|
|
|
#ifndef _LSM_INT_H
|
|
# include "lsmInt.h"
|
|
#endif
|
|
|
|
#define LSM_LOG_STRUCTURE 0
|
|
#define LSM_LOG_DATA 0
|
|
|
|
/*
|
|
** Macros to help decode record types.
|
|
*/
|
|
#define rtTopic(eType) ((eType) & LSM_SYSTEMKEY)
|
|
#define rtIsDelete(eType) (((eType) & 0x0F)==LSM_POINT_DELETE)
|
|
|
|
#define rtIsSeparator(eType) (((eType) & LSM_SEPARATOR)!=0)
|
|
#define rtIsWrite(eType) (((eType) & LSM_INSERT)!=0)
|
|
#define rtIsSystem(eType) (((eType) & LSM_SYSTEMKEY)!=0)
|
|
|
|
/*
|
|
** The following macros are used to access a page footer.
|
|
*/
|
|
#define SEGMENT_NRECORD_OFFSET(pgsz) ((pgsz) - 2)
|
|
#define SEGMENT_FLAGS_OFFSET(pgsz) ((pgsz) - 2 - 2)
|
|
#define SEGMENT_POINTER_OFFSET(pgsz) ((pgsz) - 2 - 2 - 8)
|
|
#define SEGMENT_CELLPTR_OFFSET(pgsz, iCell) ((pgsz) - 2 - 2 - 8 - 2 - (iCell)*2)
|
|
|
|
#define SEGMENT_EOF(pgsz, nEntry) SEGMENT_CELLPTR_OFFSET(pgsz, nEntry-1)
|
|
|
|
#define SEGMENT_BTREE_FLAG 0x0001
|
|
#define PGFTR_SKIP_NEXT_FLAG 0x0002
|
|
#define PGFTR_SKIP_THIS_FLAG 0x0004
|
|
|
|
|
|
#ifndef LSM_SEGMENTPTR_FREE_THRESHOLD
|
|
# define LSM_SEGMENTPTR_FREE_THRESHOLD 1024
|
|
#endif
|
|
|
|
typedef struct SegmentPtr SegmentPtr;
|
|
typedef struct LsmBlob LsmBlob;
|
|
|
|
struct LsmBlob {
|
|
lsm_env *pEnv;
|
|
void *pData;
|
|
int nData;
|
|
int nAlloc;
|
|
};
|
|
|
|
/*
|
|
** A SegmentPtr object may be used for one of two purposes:
|
|
**
|
|
** * To iterate and/or seek within a single Segment (the combination of a
|
|
** main run and an optional sorted run).
|
|
**
|
|
** * To iterate through the separators array of a segment.
|
|
*/
|
|
struct SegmentPtr {
|
|
Level *pLevel; /* Level object segment is part of */
|
|
Segment *pSeg; /* Segment to access */
|
|
|
|
/* Current page. See segmentPtrLoadPage(). */
|
|
Page *pPg; /* Current page */
|
|
u16 flags; /* Copy of page flags field */
|
|
int nCell; /* Number of cells on pPg */
|
|
LsmPgno iPtr; /* Base cascade pointer */
|
|
|
|
/* Current cell. See segmentPtrLoadCell() */
|
|
int iCell; /* Current record within page pPg */
|
|
int eType; /* Type of current record */
|
|
LsmPgno iPgPtr; /* Cascade pointer offset */
|
|
void *pKey; int nKey; /* Key associated with current record */
|
|
void *pVal; int nVal; /* Current record value (eType==WRITE only) */
|
|
|
|
/* Blobs used to allocate buffers for pKey and pVal as required */
|
|
LsmBlob blob1;
|
|
LsmBlob blob2;
|
|
};
|
|
|
|
/*
|
|
** Used to iterate through the keys stored in a b-tree hierarchy from start
|
|
** to finish. Only First() and Next() operations are required.
|
|
**
|
|
** btreeCursorNew()
|
|
** btreeCursorFirst()
|
|
** btreeCursorNext()
|
|
** btreeCursorFree()
|
|
** btreeCursorPosition()
|
|
** btreeCursorRestore()
|
|
*/
|
|
typedef struct BtreePg BtreePg;
|
|
typedef struct BtreeCursor BtreeCursor;
|
|
struct BtreePg {
|
|
Page *pPage;
|
|
int iCell;
|
|
};
|
|
struct BtreeCursor {
|
|
Segment *pSeg; /* Iterate through this segments btree */
|
|
FileSystem *pFS; /* File system to read pages from */
|
|
int nDepth; /* Allocated size of aPg[] */
|
|
int iPg; /* Current entry in aPg[]. -1 -> EOF. */
|
|
BtreePg *aPg; /* Pages from root to current location */
|
|
|
|
/* Cache of current entry. pKey==0 for EOF. */
|
|
void *pKey;
|
|
int nKey;
|
|
int eType;
|
|
LsmPgno iPtr;
|
|
|
|
/* Storage for key, if not local */
|
|
LsmBlob blob;
|
|
};
|
|
|
|
|
|
/*
|
|
** A cursor used for merged searches or iterations through up to one
|
|
** Tree structure and any number of sorted files.
|
|
**
|
|
** lsmMCursorNew()
|
|
** lsmMCursorSeek()
|
|
** lsmMCursorNext()
|
|
** lsmMCursorPrev()
|
|
** lsmMCursorFirst()
|
|
** lsmMCursorLast()
|
|
** lsmMCursorKey()
|
|
** lsmMCursorValue()
|
|
** lsmMCursorValid()
|
|
**
|
|
** iFree:
|
|
** This variable is only used by cursors providing input data for a
|
|
** new top-level segment. Such cursors only ever iterate forwards, not
|
|
** backwards.
|
|
*/
|
|
struct MultiCursor {
|
|
lsm_db *pDb; /* Connection that owns this cursor */
|
|
MultiCursor *pNext; /* Next cursor owned by connection pDb */
|
|
int flags; /* Mask of CURSOR_XXX flags */
|
|
|
|
int eType; /* Cache of current key type */
|
|
LsmBlob key; /* Cache of current key (or NULL) */
|
|
LsmBlob val; /* Cache of current value */
|
|
|
|
/* All the component cursors: */
|
|
TreeCursor *apTreeCsr[2]; /* Up to two tree cursors */
|
|
int iFree; /* Next element of free-list (-ve for eof) */
|
|
SegmentPtr *aPtr; /* Array of segment pointers */
|
|
int nPtr; /* Size of array aPtr[] */
|
|
BtreeCursor *pBtCsr; /* b-tree cursor (db writes only) */
|
|
|
|
/* Comparison results */
|
|
int nTree; /* Size of aTree[] array */
|
|
int *aTree; /* Array of comparison results */
|
|
|
|
/* Used by cursors flushing the in-memory tree only */
|
|
void *pSystemVal; /* Pointer to buffer to free */
|
|
|
|
/* Used by worker cursors only */
|
|
LsmPgno *pPrevMergePtr;
|
|
};
|
|
|
|
/*
|
|
** The following constants are used to assign integers to each component
|
|
** cursor of a multi-cursor.
|
|
*/
|
|
#define CURSOR_DATA_TREE0 0 /* Current tree cursor (apTreeCsr[0]) */
|
|
#define CURSOR_DATA_TREE1 1 /* The "old" tree, if any (apTreeCsr[1]) */
|
|
#define CURSOR_DATA_SYSTEM 2 /* Free-list entries (new-toplevel only) */
|
|
#define CURSOR_DATA_SEGMENT 3 /* First segment pointer (aPtr[0]) */
|
|
|
|
/*
|
|
** CURSOR_IGNORE_DELETE
|
|
** If set, this cursor will not visit SORTED_DELETE keys.
|
|
**
|
|
** CURSOR_FLUSH_FREELIST
|
|
** This cursor is being used to create a new toplevel. It should also
|
|
** iterate through the contents of the in-memory free block list.
|
|
**
|
|
** CURSOR_IGNORE_SYSTEM
|
|
** If set, this cursor ignores system keys.
|
|
**
|
|
** CURSOR_NEXT_OK
|
|
** Set if it is Ok to call lsm_csr_next().
|
|
**
|
|
** CURSOR_PREV_OK
|
|
** Set if it is Ok to call lsm_csr_prev().
|
|
**
|
|
** CURSOR_READ_SEPARATORS
|
|
** Set if this cursor should visit the separator keys in segment
|
|
** aPtr[nPtr-1].
|
|
**
|
|
** CURSOR_SEEK_EQ
|
|
** Cursor has undergone a successful lsm_csr_seek(LSM_SEEK_EQ) operation.
|
|
** The key and value are stored in MultiCursor.key and MultiCursor.val
|
|
** respectively.
|
|
*/
|
|
#define CURSOR_IGNORE_DELETE 0x00000001
|
|
#define CURSOR_FLUSH_FREELIST 0x00000002
|
|
#define CURSOR_IGNORE_SYSTEM 0x00000010
|
|
#define CURSOR_NEXT_OK 0x00000020
|
|
#define CURSOR_PREV_OK 0x00000040
|
|
#define CURSOR_READ_SEPARATORS 0x00000080
|
|
#define CURSOR_SEEK_EQ 0x00000100
|
|
|
|
typedef struct MergeWorker MergeWorker;
|
|
typedef struct Hierarchy Hierarchy;
|
|
|
|
struct Hierarchy {
|
|
Page **apHier;
|
|
int nHier;
|
|
};
|
|
|
|
/*
|
|
** aSave:
|
|
** When mergeWorkerNextPage() is called to advance to the next page in
|
|
** the output segment, if the bStore flag for an element of aSave[] is
|
|
** true, it is cleared and the corresponding iPgno value is set to the
|
|
** page number of the page just completed.
|
|
**
|
|
** aSave[0] is used to record the pointer value to be pushed into the
|
|
** b-tree hierarchy. aSave[1] is used to save the page number of the
|
|
** page containing the indirect key most recently written to the b-tree.
|
|
** see mergeWorkerPushHierarchy() for details.
|
|
*/
|
|
struct MergeWorker {
|
|
lsm_db *pDb; /* Database handle */
|
|
Level *pLevel; /* Worker snapshot Level being merged */
|
|
MultiCursor *pCsr; /* Cursor to read new segment contents from */
|
|
int bFlush; /* True if this is an in-memory tree flush */
|
|
Hierarchy hier; /* B-tree hierarchy under construction */
|
|
Page *pPage; /* Current output page */
|
|
int nWork; /* Number of calls to mergeWorkerNextPage() */
|
|
LsmPgno *aGobble; /* Gobble point for each input segment */
|
|
|
|
LsmPgno iIndirect;
|
|
struct SavedPgno {
|
|
LsmPgno iPgno;
|
|
int bStore;
|
|
} aSave[2];
|
|
};
|
|
|
|
#ifdef LSM_DEBUG_EXPENSIVE
|
|
static int assertPointersOk(lsm_db *, Segment *, Segment *, int);
|
|
static int assertBtreeOk(lsm_db *, Segment *);
|
|
static void assertRunInOrder(lsm_db *pDb, Segment *pSeg);
|
|
#else
|
|
#define assertRunInOrder(x,y)
|
|
#define assertBtreeOk(x,y)
|
|
#endif
|
|
|
|
|
|
struct FilePage { u8 *aData; int nData; };
|
|
static u8 *fsPageData(Page *pPg, int *pnData){
|
|
*pnData = ((struct FilePage *)(pPg))->nData;
|
|
return ((struct FilePage *)(pPg))->aData;
|
|
}
|
|
/*UNUSED static u8 *fsPageDataPtr(Page *pPg){
|
|
return ((struct FilePage *)(pPg))->aData;
|
|
}*/
|
|
|
|
/*
|
|
** Write nVal as a 16-bit unsigned big-endian integer into buffer aOut.
|
|
*/
|
|
void lsmPutU16(u8 *aOut, u16 nVal){
|
|
aOut[0] = (u8)((nVal>>8) & 0xFF);
|
|
aOut[1] = (u8)(nVal & 0xFF);
|
|
}
|
|
|
|
void lsmPutU32(u8 *aOut, u32 nVal){
|
|
aOut[0] = (u8)((nVal>>24) & 0xFF);
|
|
aOut[1] = (u8)((nVal>>16) & 0xFF);
|
|
aOut[2] = (u8)((nVal>> 8) & 0xFF);
|
|
aOut[3] = (u8)((nVal ) & 0xFF);
|
|
}
|
|
|
|
int lsmGetU16(u8 *aOut){
|
|
return (aOut[0] << 8) + aOut[1];
|
|
}
|
|
|
|
u32 lsmGetU32(u8 *aOut){
|
|
return ((u32)aOut[0] << 24)
|
|
+ ((u32)aOut[1] << 16)
|
|
+ ((u32)aOut[2] << 8)
|
|
+ ((u32)aOut[3]);
|
|
}
|
|
|
|
u64 lsmGetU64(u8 *aOut){
|
|
return ((u64)aOut[0] << 56)
|
|
+ ((u64)aOut[1] << 48)
|
|
+ ((u64)aOut[2] << 40)
|
|
+ ((u64)aOut[3] << 32)
|
|
+ ((u64)aOut[4] << 24)
|
|
+ ((u32)aOut[5] << 16)
|
|
+ ((u32)aOut[6] << 8)
|
|
+ ((u32)aOut[7]);
|
|
}
|
|
|
|
void lsmPutU64(u8 *aOut, u64 nVal){
|
|
aOut[0] = (u8)((nVal>>56) & 0xFF);
|
|
aOut[1] = (u8)((nVal>>48) & 0xFF);
|
|
aOut[2] = (u8)((nVal>>40) & 0xFF);
|
|
aOut[3] = (u8)((nVal>>32) & 0xFF);
|
|
aOut[4] = (u8)((nVal>>24) & 0xFF);
|
|
aOut[5] = (u8)((nVal>>16) & 0xFF);
|
|
aOut[6] = (u8)((nVal>> 8) & 0xFF);
|
|
aOut[7] = (u8)((nVal ) & 0xFF);
|
|
}
|
|
|
|
static int sortedBlobGrow(lsm_env *pEnv, LsmBlob *pBlob, int nData){
|
|
assert( pBlob->pEnv==pEnv || (pBlob->pEnv==0 && pBlob->pData==0) );
|
|
if( pBlob->nAlloc<nData ){
|
|
pBlob->pData = lsmReallocOrFree(pEnv, pBlob->pData, nData);
|
|
if( !pBlob->pData ) return LSM_NOMEM_BKPT;
|
|
pBlob->nAlloc = nData;
|
|
pBlob->pEnv = pEnv;
|
|
}
|
|
return LSM_OK;
|
|
}
|
|
|
|
static int sortedBlobSet(lsm_env *pEnv, LsmBlob *pBlob, void *pData, int nData){
|
|
if( sortedBlobGrow(pEnv, pBlob, nData) ) return LSM_NOMEM;
|
|
memcpy(pBlob->pData, pData, nData);
|
|
pBlob->nData = nData;
|
|
return LSM_OK;
|
|
}
|
|
|
|
#if 0
|
|
static int sortedBlobCopy(LsmBlob *pDest, LsmBlob *pSrc){
|
|
return sortedBlobSet(pDest, pSrc->pData, pSrc->nData);
|
|
}
|
|
#endif
|
|
|
|
static void sortedBlobFree(LsmBlob *pBlob){
|
|
assert( pBlob->pEnv || pBlob->pData==0 );
|
|
if( pBlob->pData ) lsmFree(pBlob->pEnv, pBlob->pData);
|
|
memset(pBlob, 0, sizeof(LsmBlob));
|
|
}
|
|
|
|
static int sortedReadData(
|
|
Segment *pSeg,
|
|
Page *pPg,
|
|
int iOff,
|
|
int nByte,
|
|
void **ppData,
|
|
LsmBlob *pBlob
|
|
){
|
|
int rc = LSM_OK;
|
|
int iEnd;
|
|
int nData;
|
|
int nCell;
|
|
u8 *aData;
|
|
|
|
aData = fsPageData(pPg, &nData);
|
|
nCell = lsmGetU16(&aData[SEGMENT_NRECORD_OFFSET(nData)]);
|
|
iEnd = SEGMENT_EOF(nData, nCell);
|
|
assert( iEnd>0 && iEnd<nData );
|
|
|
|
if( iOff+nByte<=iEnd ){
|
|
*ppData = (void *)&aData[iOff];
|
|
}else{
|
|
int nRem = nByte;
|
|
int i = iOff;
|
|
u8 *aDest;
|
|
|
|
/* Make sure the blob is big enough to store the value being loaded. */
|
|
rc = sortedBlobGrow(lsmPageEnv(pPg), pBlob, nByte);
|
|
if( rc!=LSM_OK ) return rc;
|
|
pBlob->nData = nByte;
|
|
aDest = (u8 *)pBlob->pData;
|
|
*ppData = pBlob->pData;
|
|
|
|
/* Increment the pointer pages ref-count. */
|
|
lsmFsPageRef(pPg);
|
|
|
|
while( rc==LSM_OK ){
|
|
Page *pNext;
|
|
int flags;
|
|
|
|
/* Copy data from pPg into the output buffer. */
|
|
int nCopy = LSM_MIN(nRem, iEnd-i);
|
|
if( nCopy>0 ){
|
|
memcpy(&aDest[nByte-nRem], &aData[i], nCopy);
|
|
nRem -= nCopy;
|
|
i += nCopy;
|
|
assert( nRem==0 || i==iEnd );
|
|
}
|
|
assert( nRem>=0 );
|
|
if( nRem==0 ) break;
|
|
i -= iEnd;
|
|
|
|
/* Grab the next page in the segment */
|
|
|
|
do {
|
|
rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
|
|
if( rc==LSM_OK && pNext==0 ){
|
|
rc = LSM_CORRUPT_BKPT;
|
|
}
|
|
if( rc ) break;
|
|
lsmFsPageRelease(pPg);
|
|
pPg = pNext;
|
|
aData = fsPageData(pPg, &nData);
|
|
flags = lsmGetU16(&aData[SEGMENT_FLAGS_OFFSET(nData)]);
|
|
}while( flags&SEGMENT_BTREE_FLAG );
|
|
|
|
iEnd = SEGMENT_EOF(nData, lsmGetU16(&aData[nData-2]));
|
|
assert( iEnd>0 && iEnd<nData );
|
|
}
|
|
|
|
lsmFsPageRelease(pPg);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int pageGetNRec(u8 *aData, int nData){
|
|
return (int)lsmGetU16(&aData[SEGMENT_NRECORD_OFFSET(nData)]);
|
|
}
|
|
|
|
static LsmPgno pageGetPtr(u8 *aData, int nData){
|
|
return (LsmPgno)lsmGetU64(&aData[SEGMENT_POINTER_OFFSET(nData)]);
|
|
}
|
|
|
|
static int pageGetFlags(u8 *aData, int nData){
|
|
return (int)lsmGetU16(&aData[SEGMENT_FLAGS_OFFSET(nData)]);
|
|
}
|
|
|
|
static u8 *pageGetCell(u8 *aData, int nData, int iCell){
|
|
return &aData[lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, iCell)])];
|
|
}
|
|
|
|
/*
|
|
** Return the number of cells on page pPg.
|
|
*/
|
|
static int pageObjGetNRec(Page *pPg){
|
|
int nData;
|
|
u8 *aData = lsmFsPageData(pPg, &nData);
|
|
return pageGetNRec(aData, nData);
|
|
}
|
|
|
|
/*
|
|
** Return the decoded (possibly relative) pointer value stored in cell
|
|
** iCell from page aData/nData.
|
|
*/
|
|
static LsmPgno pageGetRecordPtr(u8 *aData, int nData, int iCell){
|
|
LsmPgno iRet; /* Return value */
|
|
u8 *aCell; /* Pointer to cell iCell */
|
|
|
|
assert( iCell<pageGetNRec(aData, nData) && iCell>=0 );
|
|
aCell = pageGetCell(aData, nData, iCell);
|
|
lsmVarintGet64(&aCell[1], &iRet);
|
|
return iRet;
|
|
}
|
|
|
|
static u8 *pageGetKey(
|
|
Segment *pSeg, /* Segment pPg belongs to */
|
|
Page *pPg, /* Page to read from */
|
|
int iCell, /* Index of cell on page to read */
|
|
int *piTopic, /* OUT: Topic associated with this key */
|
|
int *pnKey, /* OUT: Size of key in bytes */
|
|
LsmBlob *pBlob /* If required, use this for dynamic memory */
|
|
){
|
|
u8 *pKey;
|
|
i64 nDummy;
|
|
int eType;
|
|
u8 *aData;
|
|
int nData;
|
|
|
|
aData = fsPageData(pPg, &nData);
|
|
|
|
assert( !(pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG) );
|
|
assert( iCell<pageGetNRec(aData, nData) );
|
|
|
|
pKey = pageGetCell(aData, nData, iCell);
|
|
eType = *pKey++;
|
|
pKey += lsmVarintGet64(pKey, &nDummy);
|
|
pKey += lsmVarintGet32(pKey, pnKey);
|
|
if( rtIsWrite(eType) ){
|
|
pKey += lsmVarintGet64(pKey, &nDummy);
|
|
}
|
|
*piTopic = rtTopic(eType);
|
|
|
|
sortedReadData(pSeg, pPg, pKey-aData, *pnKey, (void **)&pKey, pBlob);
|
|
return pKey;
|
|
}
|
|
|
|
static int pageGetKeyCopy(
|
|
lsm_env *pEnv, /* Environment handle */
|
|
Segment *pSeg, /* Segment pPg belongs to */
|
|
Page *pPg, /* Page to read from */
|
|
int iCell, /* Index of cell on page to read */
|
|
int *piTopic, /* OUT: Topic associated with this key */
|
|
LsmBlob *pBlob /* If required, use this for dynamic memory */
|
|
){
|
|
int rc = LSM_OK;
|
|
int nKey;
|
|
u8 *aKey;
|
|
|
|
aKey = pageGetKey(pSeg, pPg, iCell, piTopic, &nKey, pBlob);
|
|
assert( (void *)aKey!=pBlob->pData || nKey==pBlob->nData );
|
|
if( (void *)aKey!=pBlob->pData ){
|
|
rc = sortedBlobSet(pEnv, pBlob, aKey, nKey);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static LsmPgno pageGetBtreeRef(Page *pPg, int iKey){
|
|
LsmPgno iRef;
|
|
u8 *aData;
|
|
int nData;
|
|
u8 *aCell;
|
|
|
|
aData = fsPageData(pPg, &nData);
|
|
aCell = pageGetCell(aData, nData, iKey);
|
|
assert( aCell[0]==0 );
|
|
aCell++;
|
|
aCell += lsmVarintGet64(aCell, &iRef);
|
|
lsmVarintGet64(aCell, &iRef);
|
|
assert( iRef>0 );
|
|
return iRef;
|
|
}
|
|
|
|
#define GETVARINT64(a, i) (((i)=((u8*)(a))[0])<=240?1:lsmVarintGet64((a), &(i)))
|
|
#define GETVARINT32(a, i) (((i)=((u8*)(a))[0])<=240?1:lsmVarintGet32((a), &(i)))
|
|
|
|
static int pageGetBtreeKey(
|
|
Segment *pSeg, /* Segment page pPg belongs to */
|
|
Page *pPg,
|
|
int iKey,
|
|
LsmPgno *piPtr,
|
|
int *piTopic,
|
|
void **ppKey,
|
|
int *pnKey,
|
|
LsmBlob *pBlob
|
|
){
|
|
u8 *aData;
|
|
int nData;
|
|
u8 *aCell;
|
|
int eType;
|
|
|
|
aData = fsPageData(pPg, &nData);
|
|
assert( SEGMENT_BTREE_FLAG & pageGetFlags(aData, nData) );
|
|
assert( iKey>=0 && iKey<pageGetNRec(aData, nData) );
|
|
|
|
aCell = pageGetCell(aData, nData, iKey);
|
|
eType = *aCell++;
|
|
aCell += GETVARINT64(aCell, *piPtr);
|
|
|
|
if( eType==0 ){
|
|
int rc;
|
|
LsmPgno iRef; /* Page number of referenced page */
|
|
Page *pRef;
|
|
aCell += GETVARINT64(aCell, iRef);
|
|
rc = lsmFsDbPageGet(lsmPageFS(pPg), pSeg, iRef, &pRef);
|
|
if( rc!=LSM_OK ) return rc;
|
|
pageGetKeyCopy(lsmPageEnv(pPg), pSeg, pRef, 0, &eType, pBlob);
|
|
lsmFsPageRelease(pRef);
|
|
*ppKey = pBlob->pData;
|
|
*pnKey = pBlob->nData;
|
|
}else{
|
|
aCell += GETVARINT32(aCell, *pnKey);
|
|
*ppKey = aCell;
|
|
}
|
|
if( piTopic ) *piTopic = rtTopic(eType);
|
|
|
|
return LSM_OK;
|
|
}
|
|
|
|
static int btreeCursorLoadKey(BtreeCursor *pCsr){
|
|
int rc = LSM_OK;
|
|
if( pCsr->iPg<0 ){
|
|
pCsr->pKey = 0;
|
|
pCsr->nKey = 0;
|
|
pCsr->eType = 0;
|
|
}else{
|
|
LsmPgno dummy;
|
|
int iPg = pCsr->iPg;
|
|
int iCell = pCsr->aPg[iPg].iCell;
|
|
while( iCell<0 && (--iPg)>=0 ){
|
|
iCell = pCsr->aPg[iPg].iCell-1;
|
|
}
|
|
if( iPg<0 || iCell<0 ) return LSM_CORRUPT_BKPT;
|
|
|
|
rc = pageGetBtreeKey(
|
|
pCsr->pSeg,
|
|
pCsr->aPg[iPg].pPage, iCell,
|
|
&dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob
|
|
);
|
|
pCsr->eType |= LSM_SEPARATOR;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static LsmPgno btreeCursorPtr(u8 *aData, int nData, int iCell){
|
|
int nCell;
|
|
|
|
nCell = pageGetNRec(aData, nData);
|
|
if( iCell>=nCell ){
|
|
return pageGetPtr(aData, nData);
|
|
}
|
|
return pageGetRecordPtr(aData, nData, iCell);
|
|
}
|
|
|
|
static int btreeCursorNext(BtreeCursor *pCsr){
|
|
int rc = LSM_OK;
|
|
|
|
BtreePg *pPg = &pCsr->aPg[pCsr->iPg];
|
|
int nCell;
|
|
u8 *aData;
|
|
int nData;
|
|
|
|
assert( pCsr->iPg>=0 );
|
|
assert( pCsr->iPg==pCsr->nDepth-1 );
|
|
|
|
aData = fsPageData(pPg->pPage, &nData);
|
|
nCell = pageGetNRec(aData, nData);
|
|
assert( pPg->iCell<=nCell );
|
|
pPg->iCell++;
|
|
if( pPg->iCell==nCell ){
|
|
LsmPgno iLoad;
|
|
|
|
/* Up to parent. */
|
|
lsmFsPageRelease(pPg->pPage);
|
|
pPg->pPage = 0;
|
|
pCsr->iPg--;
|
|
while( pCsr->iPg>=0 ){
|
|
pPg = &pCsr->aPg[pCsr->iPg];
|
|
aData = fsPageData(pPg->pPage, &nData);
|
|
if( pPg->iCell<pageGetNRec(aData, nData) ) break;
|
|
lsmFsPageRelease(pPg->pPage);
|
|
pCsr->iPg--;
|
|
}
|
|
|
|
/* Read the key */
|
|
rc = btreeCursorLoadKey(pCsr);
|
|
|
|
/* Unless the cursor is at EOF, descend to cell -1 (yes, negative one) of
|
|
** the left-most most descendent. */
|
|
if( pCsr->iPg>=0 ){
|
|
pCsr->aPg[pCsr->iPg].iCell++;
|
|
|
|
iLoad = btreeCursorPtr(aData, nData, pPg->iCell);
|
|
do {
|
|
Page *pLoad;
|
|
pCsr->iPg++;
|
|
rc = lsmFsDbPageGet(pCsr->pFS, pCsr->pSeg, iLoad, &pLoad);
|
|
pCsr->aPg[pCsr->iPg].pPage = pLoad;
|
|
pCsr->aPg[pCsr->iPg].iCell = 0;
|
|
if( rc==LSM_OK ){
|
|
if( pCsr->iPg==(pCsr->nDepth-1) ) break;
|
|
aData = fsPageData(pLoad, &nData);
|
|
iLoad = btreeCursorPtr(aData, nData, 0);
|
|
}
|
|
}while( rc==LSM_OK && pCsr->iPg<(pCsr->nDepth-1) );
|
|
pCsr->aPg[pCsr->iPg].iCell = -1;
|
|
}
|
|
|
|
}else{
|
|
rc = btreeCursorLoadKey(pCsr);
|
|
}
|
|
|
|
if( rc==LSM_OK && pCsr->iPg>=0 ){
|
|
aData = fsPageData(pCsr->aPg[pCsr->iPg].pPage, &nData);
|
|
pCsr->iPtr = btreeCursorPtr(aData, nData, pCsr->aPg[pCsr->iPg].iCell+1);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static void btreeCursorFree(BtreeCursor *pCsr){
|
|
if( pCsr ){
|
|
int i;
|
|
lsm_env *pEnv = lsmFsEnv(pCsr->pFS);
|
|
for(i=0; i<=pCsr->iPg; i++){
|
|
lsmFsPageRelease(pCsr->aPg[i].pPage);
|
|
}
|
|
sortedBlobFree(&pCsr->blob);
|
|
lsmFree(pEnv, pCsr->aPg);
|
|
lsmFree(pEnv, pCsr);
|
|
}
|
|
}
|
|
|
|
static int btreeCursorFirst(BtreeCursor *pCsr){
|
|
int rc;
|
|
|
|
Page *pPg = 0;
|
|
FileSystem *pFS = pCsr->pFS;
|
|
LsmPgno iPg = pCsr->pSeg->iRoot;
|
|
|
|
do {
|
|
rc = lsmFsDbPageGet(pFS, pCsr->pSeg, iPg, &pPg);
|
|
assert( (rc==LSM_OK)==(pPg!=0) );
|
|
if( rc==LSM_OK ){
|
|
u8 *aData;
|
|
int nData;
|
|
int flags;
|
|
|
|
aData = fsPageData(pPg, &nData);
|
|
flags = pageGetFlags(aData, nData);
|
|
if( (flags & SEGMENT_BTREE_FLAG)==0 ) break;
|
|
|
|
if( (pCsr->nDepth % 8)==0 ){
|
|
int nNew = pCsr->nDepth + 8;
|
|
pCsr->aPg = (BtreePg *)lsmReallocOrFreeRc(
|
|
lsmFsEnv(pFS), pCsr->aPg, sizeof(BtreePg) * nNew, &rc
|
|
);
|
|
if( rc==LSM_OK ){
|
|
memset(&pCsr->aPg[pCsr->nDepth], 0, sizeof(BtreePg) * 8);
|
|
}
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
assert( pCsr->aPg[pCsr->nDepth].iCell==0 );
|
|
pCsr->aPg[pCsr->nDepth].pPage = pPg;
|
|
pCsr->nDepth++;
|
|
iPg = pageGetRecordPtr(aData, nData, 0);
|
|
}
|
|
}
|
|
}while( rc==LSM_OK );
|
|
lsmFsPageRelease(pPg);
|
|
pCsr->iPg = pCsr->nDepth-1;
|
|
|
|
if( rc==LSM_OK && pCsr->nDepth ){
|
|
pCsr->aPg[pCsr->iPg].iCell = -1;
|
|
rc = btreeCursorNext(pCsr);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static void btreeCursorPosition(BtreeCursor *pCsr, MergeInput *p){
|
|
if( pCsr->iPg>=0 ){
|
|
p->iPg = lsmFsPageNumber(pCsr->aPg[pCsr->iPg].pPage);
|
|
p->iCell = ((pCsr->aPg[pCsr->iPg].iCell + 1) << 8) + pCsr->nDepth;
|
|
}else{
|
|
p->iPg = 0;
|
|
p->iCell = 0;
|
|
}
|
|
}
|
|
|
|
static void btreeCursorSplitkey(BtreeCursor *pCsr, MergeInput *p){
|
|
int iCell = pCsr->aPg[pCsr->iPg].iCell;
|
|
if( iCell>=0 ){
|
|
p->iCell = iCell;
|
|
p->iPg = lsmFsPageNumber(pCsr->aPg[pCsr->iPg].pPage);
|
|
}else{
|
|
int i;
|
|
for(i=pCsr->iPg-1; i>=0; i--){
|
|
if( pCsr->aPg[i].iCell>0 ) break;
|
|
}
|
|
assert( i>=0 );
|
|
p->iCell = pCsr->aPg[i].iCell-1;
|
|
p->iPg = lsmFsPageNumber(pCsr->aPg[i].pPage);
|
|
}
|
|
}
|
|
|
|
static int sortedKeyCompare(
|
|
int (*xCmp)(void *, int, void *, int),
|
|
int iLhsTopic, void *pLhsKey, int nLhsKey,
|
|
int iRhsTopic, void *pRhsKey, int nRhsKey
|
|
){
|
|
int res = iLhsTopic - iRhsTopic;
|
|
if( res==0 ){
|
|
res = xCmp(pLhsKey, nLhsKey, pRhsKey, nRhsKey);
|
|
}
|
|
return res;
|
|
}
|
|
|
|
static int btreeCursorRestore(
|
|
BtreeCursor *pCsr,
|
|
int (*xCmp)(void *, int, void *, int),
|
|
MergeInput *p
|
|
){
|
|
int rc = LSM_OK;
|
|
|
|
if( p->iPg ){
|
|
lsm_env *pEnv = lsmFsEnv(pCsr->pFS);
|
|
int iCell; /* Current cell number on leaf page */
|
|
LsmPgno iLeaf; /* Page number of current leaf page */
|
|
int nDepth; /* Depth of b-tree structure */
|
|
Segment *pSeg = pCsr->pSeg;
|
|
|
|
/* Decode the MergeInput structure */
|
|
iLeaf = p->iPg;
|
|
nDepth = (p->iCell & 0x00FF);
|
|
iCell = (p->iCell >> 8) - 1;
|
|
|
|
/* Allocate the BtreeCursor.aPg[] array */
|
|
assert( pCsr->aPg==0 );
|
|
pCsr->aPg = (BtreePg *)lsmMallocZeroRc(pEnv, sizeof(BtreePg) * nDepth, &rc);
|
|
|
|
/* Populate the last entry of the aPg[] array */
|
|
if( rc==LSM_OK ){
|
|
Page **pp = &pCsr->aPg[nDepth-1].pPage;
|
|
pCsr->iPg = nDepth-1;
|
|
pCsr->nDepth = nDepth;
|
|
pCsr->aPg[pCsr->iPg].iCell = iCell;
|
|
rc = lsmFsDbPageGet(pCsr->pFS, pSeg, iLeaf, pp);
|
|
}
|
|
|
|
/* Populate any other aPg[] array entries */
|
|
if( rc==LSM_OK && nDepth>1 ){
|
|
LsmBlob blob = {0,0,0};
|
|
void *pSeek;
|
|
int nSeek;
|
|
int iTopicSeek;
|
|
int iPg = 0;
|
|
LsmPgno iLoad = pSeg->iRoot;
|
|
Page *pPg = pCsr->aPg[nDepth-1].pPage;
|
|
|
|
if( pageObjGetNRec(pPg)==0 ){
|
|
/* This can happen when pPg is the right-most leaf in the b-tree.
|
|
** In this case, set the iTopicSeek/pSeek/nSeek key to a value
|
|
** greater than any real key. */
|
|
assert( iCell==-1 );
|
|
iTopicSeek = 1000;
|
|
pSeek = 0;
|
|
nSeek = 0;
|
|
}else{
|
|
LsmPgno dummy;
|
|
rc = pageGetBtreeKey(pSeg, pPg,
|
|
0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob
|
|
);
|
|
}
|
|
|
|
do {
|
|
Page *pPg2;
|
|
rc = lsmFsDbPageGet(pCsr->pFS, pSeg, iLoad, &pPg2);
|
|
assert( rc==LSM_OK || pPg2==0 );
|
|
if( rc==LSM_OK ){
|
|
u8 *aData; /* Buffer containing page data */
|
|
int nData; /* Size of aData[] in bytes */
|
|
int iMin;
|
|
int iMax;
|
|
int iCell2;
|
|
|
|
aData = fsPageData(pPg2, &nData);
|
|
assert( (pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG) );
|
|
|
|
iLoad = pageGetPtr(aData, nData);
|
|
iCell2 = pageGetNRec(aData, nData);
|
|
iMax = iCell2-1;
|
|
iMin = 0;
|
|
|
|
while( iMax>=iMin ){
|
|
int iTry = (iMin+iMax)/2;
|
|
void *pKey; int nKey; /* Key for cell iTry */
|
|
int iTopic; /* Topic for key pKeyT/nKeyT */
|
|
LsmPgno iPtr; /* Pointer for cell iTry */
|
|
int res; /* (pSeek - pKeyT) */
|
|
|
|
rc = pageGetBtreeKey(
|
|
pSeg, pPg2, iTry, &iPtr, &iTopic, &pKey, &nKey, &blob
|
|
);
|
|
if( rc!=LSM_OK ) break;
|
|
|
|
res = sortedKeyCompare(
|
|
xCmp, iTopicSeek, pSeek, nSeek, iTopic, pKey, nKey
|
|
);
|
|
assert( res!=0 );
|
|
|
|
if( res<0 ){
|
|
iLoad = iPtr;
|
|
iCell2 = iTry;
|
|
iMax = iTry-1;
|
|
}else{
|
|
iMin = iTry+1;
|
|
}
|
|
}
|
|
|
|
pCsr->aPg[iPg].pPage = pPg2;
|
|
pCsr->aPg[iPg].iCell = iCell2;
|
|
iPg++;
|
|
assert( iPg!=nDepth-1
|
|
|| lsmFsRedirectPage(pCsr->pFS, pSeg->pRedirect, iLoad)==iLeaf
|
|
);
|
|
}
|
|
}while( rc==LSM_OK && iPg<(nDepth-1) );
|
|
sortedBlobFree(&blob);
|
|
}
|
|
|
|
/* Load the current key and pointer */
|
|
if( rc==LSM_OK ){
|
|
BtreePg *pBtreePg;
|
|
u8 *aData;
|
|
int nData;
|
|
|
|
pBtreePg = &pCsr->aPg[pCsr->iPg];
|
|
aData = fsPageData(pBtreePg->pPage, &nData);
|
|
pCsr->iPtr = btreeCursorPtr(aData, nData, pBtreePg->iCell+1);
|
|
if( pBtreePg->iCell<0 ){
|
|
LsmPgno dummy;
|
|
int i;
|
|
for(i=pCsr->iPg-1; i>=0; i--){
|
|
if( pCsr->aPg[i].iCell>0 ) break;
|
|
}
|
|
assert( i>=0 );
|
|
rc = pageGetBtreeKey(pSeg,
|
|
pCsr->aPg[i].pPage, pCsr->aPg[i].iCell-1,
|
|
&dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob
|
|
);
|
|
pCsr->eType |= LSM_SEPARATOR;
|
|
|
|
}else{
|
|
rc = btreeCursorLoadKey(pCsr);
|
|
}
|
|
}
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
static int btreeCursorNew(
|
|
lsm_db *pDb,
|
|
Segment *pSeg,
|
|
BtreeCursor **ppCsr
|
|
){
|
|
int rc = LSM_OK;
|
|
BtreeCursor *pCsr;
|
|
|
|
assert( pSeg->iRoot );
|
|
pCsr = lsmMallocZeroRc(pDb->pEnv, sizeof(BtreeCursor), &rc);
|
|
if( pCsr ){
|
|
pCsr->pFS = pDb->pFS;
|
|
pCsr->pSeg = pSeg;
|
|
pCsr->iPg = -1;
|
|
}
|
|
|
|
*ppCsr = pCsr;
|
|
return rc;
|
|
}
|
|
|
|
static void segmentPtrSetPage(SegmentPtr *pPtr, Page *pNext){
|
|
lsmFsPageRelease(pPtr->pPg);
|
|
if( pNext ){
|
|
int nData;
|
|
u8 *aData = fsPageData(pNext, &nData);
|
|
pPtr->nCell = pageGetNRec(aData, nData);
|
|
pPtr->flags = (u16)pageGetFlags(aData, nData);
|
|
pPtr->iPtr = pageGetPtr(aData, nData);
|
|
}
|
|
pPtr->pPg = pNext;
|
|
}
|
|
|
|
/*
|
|
** Load a new page into the SegmentPtr object pPtr.
|
|
*/
|
|
static int segmentPtrLoadPage(
|
|
FileSystem *pFS,
|
|
SegmentPtr *pPtr, /* Load page into this SegmentPtr object */
|
|
LsmPgno iNew /* Page number of new page */
|
|
){
|
|
Page *pPg = 0; /* The new page */
|
|
int rc; /* Return Code */
|
|
|
|
rc = lsmFsDbPageGet(pFS, pPtr->pSeg, iNew, &pPg);
|
|
assert( rc==LSM_OK || pPg==0 );
|
|
segmentPtrSetPage(pPtr, pPg);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int segmentPtrReadData(
|
|
SegmentPtr *pPtr,
|
|
int iOff,
|
|
int nByte,
|
|
void **ppData,
|
|
LsmBlob *pBlob
|
|
){
|
|
return sortedReadData(pPtr->pSeg, pPtr->pPg, iOff, nByte, ppData, pBlob);
|
|
}
|
|
|
|
static int segmentPtrNextPage(
|
|
SegmentPtr *pPtr, /* Load page into this SegmentPtr object */
|
|
int eDir /* +1 for next(), -1 for prev() */
|
|
){
|
|
Page *pNext; /* New page to load */
|
|
int rc; /* Return code */
|
|
|
|
assert( eDir==1 || eDir==-1 );
|
|
assert( pPtr->pPg );
|
|
assert( pPtr->pSeg || eDir>0 );
|
|
|
|
rc = lsmFsDbPageNext(pPtr->pSeg, pPtr->pPg, eDir, &pNext);
|
|
assert( rc==LSM_OK || pNext==0 );
|
|
segmentPtrSetPage(pPtr, pNext);
|
|
return rc;
|
|
}
|
|
|
|
static int segmentPtrLoadCell(
|
|
SegmentPtr *pPtr, /* Load page into this SegmentPtr object */
|
|
int iNew /* Cell number of new cell */
|
|
){
|
|
int rc = LSM_OK;
|
|
if( pPtr->pPg ){
|
|
u8 *aData; /* Pointer to page data buffer */
|
|
int iOff; /* Offset in aData[] to read from */
|
|
int nPgsz; /* Size of page (aData[]) in bytes */
|
|
|
|
assert( iNew<pPtr->nCell );
|
|
pPtr->iCell = iNew;
|
|
aData = fsPageData(pPtr->pPg, &nPgsz);
|
|
iOff = lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nPgsz, pPtr->iCell)]);
|
|
pPtr->eType = aData[iOff];
|
|
iOff++;
|
|
iOff += GETVARINT64(&aData[iOff], pPtr->iPgPtr);
|
|
iOff += GETVARINT32(&aData[iOff], pPtr->nKey);
|
|
if( rtIsWrite(pPtr->eType) ){
|
|
iOff += GETVARINT32(&aData[iOff], pPtr->nVal);
|
|
}
|
|
assert( pPtr->nKey>=0 );
|
|
|
|
rc = segmentPtrReadData(
|
|
pPtr, iOff, pPtr->nKey, &pPtr->pKey, &pPtr->blob1
|
|
);
|
|
if( rc==LSM_OK && rtIsWrite(pPtr->eType) ){
|
|
rc = segmentPtrReadData(
|
|
pPtr, iOff+pPtr->nKey, pPtr->nVal, &pPtr->pVal, &pPtr->blob2
|
|
);
|
|
}else{
|
|
pPtr->nVal = 0;
|
|
pPtr->pVal = 0;
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
|
|
static Segment *sortedSplitkeySegment(Level *pLevel){
|
|
Merge *pMerge = pLevel->pMerge;
|
|
MergeInput *p = &pMerge->splitkey;
|
|
Segment *pSeg;
|
|
int i;
|
|
|
|
for(i=0; i<pMerge->nInput; i++){
|
|
if( p->iPg==pMerge->aInput[i].iPg ) break;
|
|
}
|
|
if( pMerge->nInput==(pLevel->nRight+1) && i>=(pMerge->nInput-1) ){
|
|
pSeg = &pLevel->pNext->lhs;
|
|
}else{
|
|
pSeg = &pLevel->aRhs[i];
|
|
}
|
|
|
|
return pSeg;
|
|
}
|
|
|
|
static void sortedSplitkey(lsm_db *pDb, Level *pLevel, int *pRc){
|
|
Segment *pSeg;
|
|
Page *pPg = 0;
|
|
lsm_env *pEnv = pDb->pEnv; /* Environment handle */
|
|
int rc = *pRc;
|
|
Merge *pMerge = pLevel->pMerge;
|
|
|
|
pSeg = sortedSplitkeySegment(pLevel);
|
|
if( rc==LSM_OK ){
|
|
rc = lsmFsDbPageGet(pDb->pFS, pSeg, pMerge->splitkey.iPg, &pPg);
|
|
}
|
|
if( rc==LSM_OK ){
|
|
int iTopic;
|
|
LsmBlob blob = {0, 0, 0, 0};
|
|
u8 *aData;
|
|
int nData;
|
|
|
|
aData = lsmFsPageData(pPg, &nData);
|
|
if( pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG ){
|
|
void *pKey;
|
|
int nKey;
|
|
LsmPgno dummy;
|
|
rc = pageGetBtreeKey(pSeg,
|
|
pPg, pMerge->splitkey.iCell, &dummy, &iTopic, &pKey, &nKey, &blob
|
|
);
|
|
if( rc==LSM_OK && blob.pData!=pKey ){
|
|
rc = sortedBlobSet(pEnv, &blob, pKey, nKey);
|
|
}
|
|
}else{
|
|
rc = pageGetKeyCopy(
|
|
pEnv, pSeg, pPg, pMerge->splitkey.iCell, &iTopic, &blob
|
|
);
|
|
}
|
|
|
|
pLevel->iSplitTopic = iTopic;
|
|
pLevel->pSplitKey = blob.pData;
|
|
pLevel->nSplitKey = blob.nData;
|
|
lsmFsPageRelease(pPg);
|
|
}
|
|
|
|
*pRc = rc;
|
|
}
|
|
|
|
/*
|
|
** Reset a segment cursor. Also free its buffers if they are nThreshold
|
|
** bytes or larger in size.
|
|
*/
|
|
static void segmentPtrReset(SegmentPtr *pPtr, int nThreshold){
|
|
lsmFsPageRelease(pPtr->pPg);
|
|
pPtr->pPg = 0;
|
|
pPtr->nCell = 0;
|
|
pPtr->pKey = 0;
|
|
pPtr->nKey = 0;
|
|
pPtr->pVal = 0;
|
|
pPtr->nVal = 0;
|
|
pPtr->eType = 0;
|
|
pPtr->iCell = 0;
|
|
if( pPtr->blob1.nAlloc>=nThreshold ) sortedBlobFree(&pPtr->blob1);
|
|
if( pPtr->blob2.nAlloc>=nThreshold ) sortedBlobFree(&pPtr->blob2);
|
|
}
|
|
|
|
static int segmentPtrIgnoreSeparators(MultiCursor *pCsr, SegmentPtr *pPtr){
|
|
return (pCsr->flags & CURSOR_READ_SEPARATORS)==0
|
|
|| (pPtr!=&pCsr->aPtr[pCsr->nPtr-1]);
|
|
}
|
|
|
|
static int segmentPtrAdvance(
|
|
MultiCursor *pCsr,
|
|
SegmentPtr *pPtr,
|
|
int bReverse
|
|
){
|
|
int eDir = (bReverse ? -1 : 1);
|
|
Level *pLvl = pPtr->pLevel;
|
|
do {
|
|
int rc;
|
|
int iCell; /* Number of new cell in page */
|
|
int svFlags = 0; /* SegmentPtr.eType before advance */
|
|
|
|
iCell = pPtr->iCell + eDir;
|
|
assert( pPtr->pPg );
|
|
assert( iCell<=pPtr->nCell && iCell>=-1 );
|
|
|
|
if( bReverse && pPtr->pSeg!=&pPtr->pLevel->lhs ){
|
|
svFlags = pPtr->eType;
|
|
assert( svFlags );
|
|
}
|
|
|
|
if( iCell>=pPtr->nCell || iCell<0 ){
|
|
do {
|
|
rc = segmentPtrNextPage(pPtr, eDir);
|
|
}while( rc==LSM_OK
|
|
&& pPtr->pPg
|
|
&& (pPtr->nCell==0 || (pPtr->flags & SEGMENT_BTREE_FLAG) )
|
|
);
|
|
if( rc!=LSM_OK ) return rc;
|
|
iCell = bReverse ? (pPtr->nCell-1) : 0;
|
|
}
|
|
rc = segmentPtrLoadCell(pPtr, iCell);
|
|
if( rc!=LSM_OK ) return rc;
|
|
|
|
if( svFlags && pPtr->pPg ){
|
|
int res = sortedKeyCompare(pCsr->pDb->xCmp,
|
|
rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey,
|
|
pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
|
|
);
|
|
if( res<0 ) segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
|
|
}
|
|
|
|
if( pPtr->pPg==0 && (svFlags & LSM_END_DELETE) ){
|
|
Segment *pSeg = pPtr->pSeg;
|
|
rc = lsmFsDbPageGet(pCsr->pDb->pFS, pSeg, pSeg->iFirst, &pPtr->pPg);
|
|
if( rc!=LSM_OK ) return rc;
|
|
pPtr->eType = LSM_START_DELETE | LSM_POINT_DELETE;
|
|
pPtr->eType |= (pLvl->iSplitTopic ? LSM_SYSTEMKEY : 0);
|
|
pPtr->pKey = pLvl->pSplitKey;
|
|
pPtr->nKey = pLvl->nSplitKey;
|
|
}
|
|
|
|
}while( pCsr
|
|
&& pPtr->pPg
|
|
&& segmentPtrIgnoreSeparators(pCsr, pPtr)
|
|
&& rtIsSeparator(pPtr->eType)
|
|
);
|
|
|
|
return LSM_OK;
|
|
}
|
|
|
|
static void segmentPtrEndPage(
|
|
FileSystem *pFS,
|
|
SegmentPtr *pPtr,
|
|
int bLast,
|
|
int *pRc
|
|
){
|
|
if( *pRc==LSM_OK ){
|
|
Segment *pSeg = pPtr->pSeg;
|
|
Page *pNew = 0;
|
|
if( bLast ){
|
|
*pRc = lsmFsDbPageLast(pFS, pSeg, &pNew);
|
|
}else{
|
|
*pRc = lsmFsDbPageGet(pFS, pSeg, pSeg->iFirst, &pNew);
|
|
}
|
|
segmentPtrSetPage(pPtr, pNew);
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
** Try to move the segment pointer passed as the second argument so that it
|
|
** points at either the first (bLast==0) or last (bLast==1) cell in the valid
|
|
** region of the segment defined by pPtr->iFirst and pPtr->iLast.
|
|
**
|
|
** Return LSM_OK if successful or an lsm error code if something goes
|
|
** wrong (IO error, OOM etc.).
|
|
*/
|
|
static int segmentPtrEnd(MultiCursor *pCsr, SegmentPtr *pPtr, int bLast){
|
|
Level *pLvl = pPtr->pLevel;
|
|
int rc = LSM_OK;
|
|
FileSystem *pFS = pCsr->pDb->pFS;
|
|
int bIgnore;
|
|
|
|
segmentPtrEndPage(pFS, pPtr, bLast, &rc);
|
|
while( rc==LSM_OK && pPtr->pPg
|
|
&& (pPtr->nCell==0 || (pPtr->flags & SEGMENT_BTREE_FLAG))
|
|
){
|
|
rc = segmentPtrNextPage(pPtr, (bLast ? -1 : 1));
|
|
}
|
|
|
|
if( rc==LSM_OK && pPtr->pPg ){
|
|
rc = segmentPtrLoadCell(pPtr, bLast ? (pPtr->nCell-1) : 0);
|
|
if( rc==LSM_OK && bLast && pPtr->pSeg!=&pLvl->lhs ){
|
|
int res = sortedKeyCompare(pCsr->pDb->xCmp,
|
|
rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey,
|
|
pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
|
|
);
|
|
if( res<0 ) segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
|
|
}
|
|
}
|
|
|
|
bIgnore = segmentPtrIgnoreSeparators(pCsr, pPtr);
|
|
if( rc==LSM_OK && pPtr->pPg && bIgnore && rtIsSeparator(pPtr->eType) ){
|
|
rc = segmentPtrAdvance(pCsr, pPtr, bLast);
|
|
}
|
|
|
|
#if 0
|
|
if( bLast && rc==LSM_OK && pPtr->pPg
|
|
&& pPtr->pSeg==&pLvl->lhs
|
|
&& pLvl->nRight && (pPtr->eType & LSM_START_DELETE)
|
|
){
|
|
pPtr->iCell++;
|
|
pPtr->eType = LSM_END_DELETE | (pLvl->iSplitTopic);
|
|
pPtr->pKey = pLvl->pSplitKey;
|
|
pPtr->nKey = pLvl->nSplitKey;
|
|
pPtr->pVal = 0;
|
|
pPtr->nVal = 0;
|
|
}
|
|
#endif
|
|
|
|
return rc;
|
|
}
|
|
|
|
static void segmentPtrKey(SegmentPtr *pPtr, void **ppKey, int *pnKey){
|
|
assert( pPtr->pPg );
|
|
*ppKey = pPtr->pKey;
|
|
*pnKey = pPtr->nKey;
|
|
}
|
|
|
|
#if 0 /* NOT USED */
|
|
static char *keyToString(lsm_env *pEnv, void *pKey, int nKey){
|
|
int i;
|
|
u8 *aKey = (u8 *)pKey;
|
|
char *zRet = (char *)lsmMalloc(pEnv, nKey+1);
|
|
|
|
for(i=0; i<nKey; i++){
|
|
zRet[i] = (char)(isalnum(aKey[i]) ? aKey[i] : '.');
|
|
}
|
|
zRet[nKey] = '\0';
|
|
return zRet;
|
|
}
|
|
#endif
|
|
|
|
#if 0 /* NOT USED */
|
|
/*
|
|
** Check that the page that pPtr currently has loaded is the correct page
|
|
** to search for key (pKey/nKey). If it is, return 1. Otherwise, an assert
|
|
** fails and this function does not return.
|
|
*/
|
|
static int assertKeyLocation(
|
|
MultiCursor *pCsr,
|
|
SegmentPtr *pPtr,
|
|
void *pKey, int nKey
|
|
){
|
|
lsm_env *pEnv = lsmFsEnv(pCsr->pDb->pFS);
|
|
LsmBlob blob = {0, 0, 0};
|
|
int eDir;
|
|
int iTopic = 0; /* TODO: Fix me */
|
|
|
|
for(eDir=-1; eDir<=1; eDir+=2){
|
|
Page *pTest = pPtr->pPg;
|
|
|
|
lsmFsPageRef(pTest);
|
|
while( pTest ){
|
|
Segment *pSeg = pPtr->pSeg;
|
|
Page *pNext;
|
|
|
|
int rc = lsmFsDbPageNext(pSeg, pTest, eDir, &pNext);
|
|
lsmFsPageRelease(pTest);
|
|
if( rc ) return 1;
|
|
pTest = pNext;
|
|
|
|
if( pTest ){
|
|
int nData;
|
|
u8 *aData = fsPageData(pTest, &nData);
|
|
int nCell = pageGetNRec(aData, nData);
|
|
int flags = pageGetFlags(aData, nData);
|
|
if( nCell && 0==(flags&SEGMENT_BTREE_FLAG) ){
|
|
int nPgKey;
|
|
int iPgTopic;
|
|
u8 *pPgKey;
|
|
int res;
|
|
int iCell;
|
|
|
|
iCell = ((eDir < 0) ? (nCell-1) : 0);
|
|
pPgKey = pageGetKey(pSeg, pTest, iCell, &iPgTopic, &nPgKey, &blob);
|
|
res = iTopic - iPgTopic;
|
|
if( res==0 ) res = pCsr->pDb->xCmp(pKey, nKey, pPgKey, nPgKey);
|
|
if( (eDir==1 && res>0) || (eDir==-1 && res<0) ){
|
|
/* Taking this branch means something has gone wrong. */
|
|
char *zMsg = lsmMallocPrintf(pEnv, "Key \"%s\" is not on page %d",
|
|
keyToString(pEnv, pKey, nKey), lsmFsPageNumber(pPtr->pPg)
|
|
);
|
|
fprintf(stderr, "%s\n", zMsg);
|
|
assert( !"assertKeyLocation() failed" );
|
|
}
|
|
lsmFsPageRelease(pTest);
|
|
pTest = 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
sortedBlobFree(&blob);
|
|
return 1;
|
|
}
|
|
#endif
|
|
|
|
#ifndef NDEBUG
|
|
static int assertSeekResult(
|
|
MultiCursor *pCsr,
|
|
SegmentPtr *pPtr,
|
|
int iTopic,
|
|
void *pKey,
|
|
int nKey,
|
|
int eSeek
|
|
){
|
|
if( pPtr->pPg ){
|
|
int res;
|
|
res = sortedKeyCompare(pCsr->pDb->xCmp, iTopic, pKey, nKey,
|
|
rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey
|
|
);
|
|
|
|
if( eSeek==LSM_SEEK_EQ ) return (res==0);
|
|
if( eSeek==LSM_SEEK_LE ) return (res>=0);
|
|
if( eSeek==LSM_SEEK_GE ) return (res<=0);
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
#endif
|
|
|
|
static int segmentPtrSearchOversized(
|
|
MultiCursor *pCsr, /* Cursor context */
|
|
SegmentPtr *pPtr, /* Pointer to seek */
|
|
int iTopic, /* Topic of key to search for */
|
|
void *pKey, int nKey /* Key to seek to */
|
|
){
|
|
int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
|
|
int rc = LSM_OK;
|
|
|
|
/* If the OVERSIZED flag is set, then there is no pointer in the
|
|
** upper level to the next page in the segment that contains at least
|
|
** one key. So compare the largest key on the current page with the
|
|
** key being sought (pKey/nKey). If (pKey/nKey) is larger, advance
|
|
** to the next page in the segment that contains at least one key.
|
|
*/
|
|
while( rc==LSM_OK && (pPtr->flags & PGFTR_SKIP_NEXT_FLAG) ){
|
|
u8 *pLastKey;
|
|
int nLastKey;
|
|
int iLastTopic;
|
|
int res; /* Result of comparison */
|
|
Page *pNext;
|
|
|
|
/* Load the last key on the current page. */
|
|
pLastKey = pageGetKey(pPtr->pSeg,
|
|
pPtr->pPg, pPtr->nCell-1, &iLastTopic, &nLastKey, &pPtr->blob1
|
|
);
|
|
|
|
/* If the loaded key is >= than (pKey/nKey), break out of the loop.
|
|
** If (pKey/nKey) is present in this array, it must be on the current
|
|
** page. */
|
|
res = sortedKeyCompare(
|
|
xCmp, iLastTopic, pLastKey, nLastKey, iTopic, pKey, nKey
|
|
);
|
|
if( res>=0 ) break;
|
|
|
|
/* Advance to the next page that contains at least one key. */
|
|
pNext = pPtr->pPg;
|
|
lsmFsPageRef(pNext);
|
|
while( 1 ){
|
|
Page *pLoad;
|
|
u8 *aData; int nData;
|
|
|
|
rc = lsmFsDbPageNext(pPtr->pSeg, pNext, 1, &pLoad);
|
|
lsmFsPageRelease(pNext);
|
|
pNext = pLoad;
|
|
if( pNext==0 ) break;
|
|
|
|
assert( rc==LSM_OK );
|
|
aData = lsmFsPageData(pNext, &nData);
|
|
if( (pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG)==0
|
|
&& pageGetNRec(aData, nData)>0
|
|
){
|
|
break;
|
|
}
|
|
}
|
|
if( pNext==0 ) break;
|
|
segmentPtrSetPage(pPtr, pNext);
|
|
|
|
/* This should probably be an LSM_CORRUPT error. */
|
|
assert( rc!=LSM_OK || (pPtr->flags & PGFTR_SKIP_THIS_FLAG) );
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int ptrFwdPointer(
|
|
Page *pPage,
|
|
int iCell,
|
|
Segment *pSeg,
|
|
LsmPgno *piPtr,
|
|
int *pbFound
|
|
){
|
|
Page *pPg = pPage;
|
|
int iFirst = iCell;
|
|
int rc = LSM_OK;
|
|
|
|
do {
|
|
Page *pNext = 0;
|
|
u8 *aData;
|
|
int nData;
|
|
|
|
aData = lsmFsPageData(pPg, &nData);
|
|
if( (pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG)==0 ){
|
|
int i;
|
|
int nCell = pageGetNRec(aData, nData);
|
|
for(i=iFirst; i<nCell; i++){
|
|
u8 eType = *pageGetCell(aData, nData, i);
|
|
if( (eType & LSM_START_DELETE)==0 ){
|
|
*pbFound = 1;
|
|
*piPtr = pageGetRecordPtr(aData, nData, i) + pageGetPtr(aData, nData);
|
|
lsmFsPageRelease(pPg);
|
|
return LSM_OK;
|
|
}
|
|
}
|
|
}
|
|
|
|
rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
|
|
lsmFsPageRelease(pPg);
|
|
pPg = pNext;
|
|
iFirst = 0;
|
|
}while( pPg && rc==LSM_OK );
|
|
lsmFsPageRelease(pPg);
|
|
|
|
*pbFound = 0;
|
|
return rc;
|
|
}
|
|
|
|
static int sortedRhsFirst(MultiCursor *pCsr, Level *pLvl, SegmentPtr *pPtr){
|
|
int rc;
|
|
rc = segmentPtrEnd(pCsr, pPtr, 0);
|
|
while( pPtr->pPg && rc==LSM_OK ){
|
|
int res = sortedKeyCompare(pCsr->pDb->xCmp,
|
|
pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey,
|
|
rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey
|
|
);
|
|
if( res<=0 ) break;
|
|
rc = segmentPtrAdvance(pCsr, pPtr, 0);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
|
|
/*
|
|
** This function is called as part of a SEEK_GE op on a multi-cursor if the
|
|
** FC pointer read from segment *pPtr comes from an entry with the
|
|
** LSM_START_DELETE flag set. In this case the pointer value cannot be
|
|
** trusted. Instead, the pointer that should be followed is that associated
|
|
** with the next entry in *pPtr that does not have LSM_START_DELETE set.
|
|
**
|
|
** Why the pointers can't be trusted:
|
|
**
|
|
**
|
|
**
|
|
** TODO: This is a stop-gap solution:
|
|
**
|
|
** At the moment, this function is called from within segmentPtrSeek(),
|
|
** as part of the initial lsmMCursorSeek() call. However, consider a
|
|
** database where the following has occurred:
|
|
**
|
|
** 1. A range delete removes keys 1..9999 using a range delete.
|
|
** 2. Keys 1 through 9999 are reinserted.
|
|
** 3. The levels containing the ops in 1. and 2. above are merged. Call
|
|
** this level N. Level N contains FC pointers to level N+1.
|
|
**
|
|
** Then, if the user attempts to query for (key>=2 LIMIT 10), the
|
|
** lsmMCursorSeek() call will iterate through 9998 entries searching for a
|
|
** pointer down to the level N+1 that is never actually used. It would be
|
|
** much better if the multi-cursor could do this lazily - only seek to the
|
|
** level (N+1) page after the user has moved the cursor on level N passed
|
|
** the big range-delete.
|
|
*/
|
|
static int segmentPtrFwdPointer(
|
|
MultiCursor *pCsr, /* Multi-cursor pPtr belongs to */
|
|
SegmentPtr *pPtr, /* Segment-pointer to extract FC ptr from */
|
|
LsmPgno *piPtr /* OUT: FC pointer value */
|
|
){
|
|
Level *pLvl = pPtr->pLevel;
|
|
Level *pNext = pLvl->pNext;
|
|
Page *pPg = pPtr->pPg;
|
|
int rc;
|
|
int bFound;
|
|
LsmPgno iOut = 0;
|
|
|
|
if( pPtr->pSeg==&pLvl->lhs || pPtr->pSeg==&pLvl->aRhs[pLvl->nRight-1] ){
|
|
if( pNext==0
|
|
|| (pNext->nRight==0 && pNext->lhs.iRoot)
|
|
|| (pNext->nRight!=0 && pNext->aRhs[0].iRoot)
|
|
){
|
|
/* Do nothing. The pointer will not be used anyway. */
|
|
return LSM_OK;
|
|
}
|
|
}else{
|
|
if( pPtr[1].pSeg->iRoot ){
|
|
return LSM_OK;
|
|
}
|
|
}
|
|
|
|
/* Search for a pointer within the current segment. */
|
|
lsmFsPageRef(pPg);
|
|
rc = ptrFwdPointer(pPg, pPtr->iCell, pPtr->pSeg, &iOut, &bFound);
|
|
|
|
if( rc==LSM_OK && bFound==0 ){
|
|
/* This case happens when pPtr points to the left-hand-side of a segment
|
|
** currently undergoing an incremental merge. In this case, jump to the
|
|
** oldest segment in the right-hand-side of the same level and continue
|
|
** searching. But - do not consider any keys smaller than the levels
|
|
** split-key. */
|
|
SegmentPtr ptr;
|
|
|
|
if( pPtr->pLevel->nRight==0 || pPtr->pSeg!=&pPtr->pLevel->lhs ){
|
|
return LSM_CORRUPT_BKPT;
|
|
}
|
|
|
|
memset(&ptr, 0, sizeof(SegmentPtr));
|
|
ptr.pLevel = pPtr->pLevel;
|
|
ptr.pSeg = &ptr.pLevel->aRhs[ptr.pLevel->nRight-1];
|
|
rc = sortedRhsFirst(pCsr, ptr.pLevel, &ptr);
|
|
if( rc==LSM_OK ){
|
|
rc = ptrFwdPointer(ptr.pPg, ptr.iCell, ptr.pSeg, &iOut, &bFound);
|
|
ptr.pPg = 0;
|
|
}
|
|
segmentPtrReset(&ptr, 0);
|
|
}
|
|
|
|
*piPtr = iOut;
|
|
return rc;
|
|
}
|
|
|
|
static int segmentPtrSeek(
|
|
MultiCursor *pCsr, /* Cursor context */
|
|
SegmentPtr *pPtr, /* Pointer to seek */
|
|
int iTopic, /* Key topic to seek to */
|
|
void *pKey, int nKey, /* Key to seek to */
|
|
int eSeek, /* Search bias - see above */
|
|
LsmPgno *piPtr, /* OUT: FC pointer */
|
|
int *pbStop
|
|
){
|
|
int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
|
|
int res = 0; /* Result of comparison operation */
|
|
int rc = LSM_OK;
|
|
int iMin;
|
|
int iMax;
|
|
LsmPgno iPtrOut = 0;
|
|
|
|
/* If the current page contains an oversized entry, then there are no
|
|
** pointers to one or more of the subsequent pages in the sorted run.
|
|
** The following call ensures that the segment-ptr points to the correct
|
|
** page in this case. */
|
|
rc = segmentPtrSearchOversized(pCsr, pPtr, iTopic, pKey, nKey);
|
|
iPtrOut = pPtr->iPtr;
|
|
|
|
/* Assert that this page is the right page of this segment for the key
|
|
** that we are searching for. Do this by loading page (iPg-1) and testing
|
|
** that pKey/nKey is greater than all keys on that page, and then by
|
|
** loading (iPg+1) and testing that pKey/nKey is smaller than all
|
|
** the keys it houses.
|
|
**
|
|
** TODO: With range-deletes in the tree, the test described above may fail.
|
|
*/
|
|
#if 0
|
|
assert( assertKeyLocation(pCsr, pPtr, pKey, nKey) );
|
|
#endif
|
|
|
|
assert( pPtr->nCell>0
|
|
|| pPtr->pSeg->nSize==1
|
|
|| lsmFsDbPageIsLast(pPtr->pSeg, pPtr->pPg)
|
|
);
|
|
if( pPtr->nCell==0 ){
|
|
segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
|
|
}else{
|
|
iMin = 0;
|
|
iMax = pPtr->nCell-1;
|
|
|
|
while( 1 ){
|
|
int iTry = (iMin+iMax)/2;
|
|
void *pKeyT; int nKeyT; /* Key for cell iTry */
|
|
int iTopicT;
|
|
|
|
assert( iTry<iMax || iMin==iMax );
|
|
|
|
rc = segmentPtrLoadCell(pPtr, iTry);
|
|
if( rc!=LSM_OK ) break;
|
|
|
|
segmentPtrKey(pPtr, &pKeyT, &nKeyT);
|
|
iTopicT = rtTopic(pPtr->eType);
|
|
|
|
res = sortedKeyCompare(xCmp, iTopicT, pKeyT, nKeyT, iTopic, pKey, nKey);
|
|
if( res<=0 ){
|
|
iPtrOut = pPtr->iPtr + pPtr->iPgPtr;
|
|
}
|
|
|
|
if( res==0 || iMin==iMax ){
|
|
break;
|
|
}else if( res>0 ){
|
|
iMax = LSM_MAX(iTry-1, iMin);
|
|
}else{
|
|
iMin = iTry+1;
|
|
}
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
assert( res==0 || (iMin==iMax && iMin>=0 && iMin<pPtr->nCell) );
|
|
if( res ){
|
|
rc = segmentPtrLoadCell(pPtr, iMin);
|
|
}
|
|
assert( rc!=LSM_OK || res>0 || iPtrOut==(pPtr->iPtr + pPtr->iPgPtr) );
|
|
|
|
if( rc==LSM_OK ){
|
|
switch( eSeek ){
|
|
case LSM_SEEK_EQ: {
|
|
int eType = pPtr->eType;
|
|
if( (res<0 && (eType & LSM_START_DELETE))
|
|
|| (res>0 && (eType & LSM_END_DELETE))
|
|
|| (res==0 && (eType & LSM_POINT_DELETE))
|
|
){
|
|
*pbStop = 1;
|
|
}else if( res==0 && (eType & LSM_INSERT) ){
|
|
lsm_env *pEnv = pCsr->pDb->pEnv;
|
|
*pbStop = 1;
|
|
pCsr->eType = pPtr->eType;
|
|
rc = sortedBlobSet(pEnv, &pCsr->key, pPtr->pKey, pPtr->nKey);
|
|
if( rc==LSM_OK ){
|
|
rc = sortedBlobSet(pEnv, &pCsr->val, pPtr->pVal, pPtr->nVal);
|
|
}
|
|
pCsr->flags |= CURSOR_SEEK_EQ;
|
|
}
|
|
segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
|
|
break;
|
|
}
|
|
case LSM_SEEK_LE:
|
|
if( res>0 ) rc = segmentPtrAdvance(pCsr, pPtr, 1);
|
|
break;
|
|
case LSM_SEEK_GE: {
|
|
/* Figure out if we need to 'skip' the pointer forward or not */
|
|
if( (res<=0 && (pPtr->eType & LSM_START_DELETE))
|
|
|| (res>0 && (pPtr->eType & LSM_END_DELETE))
|
|
){
|
|
rc = segmentPtrFwdPointer(pCsr, pPtr, &iPtrOut);
|
|
}
|
|
if( res<0 && rc==LSM_OK ){
|
|
rc = segmentPtrAdvance(pCsr, pPtr, 0);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* If the cursor seek has found a separator key, and this cursor is
|
|
** supposed to ignore separators keys, advance to the next entry. */
|
|
if( rc==LSM_OK && pPtr->pPg
|
|
&& segmentPtrIgnoreSeparators(pCsr, pPtr)
|
|
&& rtIsSeparator(pPtr->eType)
|
|
){
|
|
assert( eSeek!=LSM_SEEK_EQ );
|
|
rc = segmentPtrAdvance(pCsr, pPtr, eSeek==LSM_SEEK_LE);
|
|
}
|
|
}
|
|
|
|
assert( rc!=LSM_OK || assertSeekResult(pCsr,pPtr,iTopic,pKey,nKey,eSeek) );
|
|
*piPtr = iPtrOut;
|
|
return rc;
|
|
}
|
|
|
|
static int seekInBtree(
|
|
MultiCursor *pCsr, /* Multi-cursor object */
|
|
Segment *pSeg, /* Seek within this segment */
|
|
int iTopic,
|
|
void *pKey, int nKey, /* Key to seek to */
|
|
LsmPgno *aPg, /* OUT: Page numbers */
|
|
Page **ppPg /* OUT: Leaf (sorted-run) page reference */
|
|
){
|
|
int i = 0;
|
|
int rc;
|
|
LsmPgno iPg;
|
|
Page *pPg = 0;
|
|
LsmBlob blob = {0, 0, 0};
|
|
|
|
iPg = pSeg->iRoot;
|
|
do {
|
|
LsmPgno *piFirst = 0;
|
|
if( aPg ){
|
|
aPg[i++] = iPg;
|
|
piFirst = &aPg[i];
|
|
}
|
|
|
|
rc = lsmFsDbPageGet(pCsr->pDb->pFS, pSeg, iPg, &pPg);
|
|
assert( rc==LSM_OK || pPg==0 );
|
|
if( rc==LSM_OK ){
|
|
u8 *aData; /* Buffer containing page data */
|
|
int nData; /* Size of aData[] in bytes */
|
|
int iMin;
|
|
int iMax;
|
|
int nRec;
|
|
int flags;
|
|
|
|
aData = fsPageData(pPg, &nData);
|
|
flags = pageGetFlags(aData, nData);
|
|
if( (flags & SEGMENT_BTREE_FLAG)==0 ) break;
|
|
|
|
iPg = pageGetPtr(aData, nData);
|
|
nRec = pageGetNRec(aData, nData);
|
|
|
|
iMin = 0;
|
|
iMax = nRec-1;
|
|
while( iMax>=iMin ){
|
|
int iTry = (iMin+iMax)/2;
|
|
void *pKeyT; int nKeyT; /* Key for cell iTry */
|
|
int iTopicT; /* Topic for key pKeyT/nKeyT */
|
|
LsmPgno iPtr; /* Pointer associated with cell iTry */
|
|
int res; /* (pKey - pKeyT) */
|
|
|
|
rc = pageGetBtreeKey(
|
|
pSeg, pPg, iTry, &iPtr, &iTopicT, &pKeyT, &nKeyT, &blob
|
|
);
|
|
if( rc!=LSM_OK ) break;
|
|
if( piFirst && pKeyT==blob.pData ){
|
|
*piFirst = pageGetBtreeRef(pPg, iTry);
|
|
piFirst = 0;
|
|
i++;
|
|
}
|
|
|
|
res = sortedKeyCompare(
|
|
pCsr->pDb->xCmp, iTopic, pKey, nKey, iTopicT, pKeyT, nKeyT
|
|
);
|
|
if( res<0 ){
|
|
iPg = iPtr;
|
|
iMax = iTry-1;
|
|
}else{
|
|
iMin = iTry+1;
|
|
}
|
|
}
|
|
lsmFsPageRelease(pPg);
|
|
pPg = 0;
|
|
}
|
|
}while( rc==LSM_OK );
|
|
|
|
sortedBlobFree(&blob);
|
|
assert( (rc==LSM_OK)==(pPg!=0) );
|
|
if( ppPg ){
|
|
*ppPg = pPg;
|
|
}else{
|
|
lsmFsPageRelease(pPg);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
static int seekInSegment(
|
|
MultiCursor *pCsr,
|
|
SegmentPtr *pPtr,
|
|
int iTopic,
|
|
void *pKey, int nKey,
|
|
LsmPgno iPg, /* Page to search */
|
|
int eSeek, /* Search bias - see above */
|
|
LsmPgno *piPtr, /* OUT: FC pointer */
|
|
int *pbStop /* OUT: Stop search flag */
|
|
){
|
|
LsmPgno iPtr = iPg;
|
|
int rc = LSM_OK;
|
|
|
|
if( pPtr->pSeg->iRoot ){
|
|
Page *pPg;
|
|
assert( pPtr->pSeg->iRoot!=0 );
|
|
rc = seekInBtree(pCsr, pPtr->pSeg, iTopic, pKey, nKey, 0, &pPg);
|
|
if( rc==LSM_OK ) segmentPtrSetPage(pPtr, pPg);
|
|
}else{
|
|
if( iPtr==0 ){
|
|
iPtr = pPtr->pSeg->iFirst;
|
|
}
|
|
if( rc==LSM_OK ){
|
|
rc = segmentPtrLoadPage(pCsr->pDb->pFS, pPtr, iPtr);
|
|
}
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
rc = segmentPtrSeek(pCsr, pPtr, iTopic, pKey, nKey, eSeek, piPtr, pbStop);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Seek each segment pointer in the array of (pLvl->nRight+1) at aPtr[].
|
|
**
|
|
** pbStop:
|
|
** This parameter is only significant if parameter eSeek is set to
|
|
** LSM_SEEK_EQ. In this case, it is set to true before returning if
|
|
** the seek operation is finished. This can happen in two ways:
|
|
**
|
|
** a) A key matching (pKey/nKey) is found, or
|
|
** b) A point-delete or range-delete deleting the key is found.
|
|
**
|
|
** In case (a), the multi-cursor CURSOR_SEEK_EQ flag is set and the pCsr->key
|
|
** and pCsr->val blobs populated before returning.
|
|
*/
|
|
static int seekInLevel(
|
|
MultiCursor *pCsr, /* Sorted cursor object to seek */
|
|
SegmentPtr *aPtr, /* Pointer to array of (nRhs+1) SPs */
|
|
int eSeek, /* Search bias - see above */
|
|
int iTopic, /* Key topic to search for */
|
|
void *pKey, int nKey, /* Key to search for */
|
|
LsmPgno *piPgno, /* IN/OUT: fraction cascade pointer (or 0) */
|
|
int *pbStop /* OUT: See above */
|
|
){
|
|
Level *pLvl = aPtr[0].pLevel; /* Level to seek within */
|
|
int rc = LSM_OK; /* Return code */
|
|
LsmPgno iOut = 0; /* Pointer to return to caller */
|
|
int res = -1; /* Result of xCmp(pKey, split) */
|
|
int nRhs = pLvl->nRight; /* Number of right-hand-side segments */
|
|
int bStop = 0;
|
|
|
|
/* If this is a composite level (one currently undergoing an incremental
|
|
** merge), figure out if the search key is larger or smaller than the
|
|
** levels split-key. */
|
|
if( nRhs ){
|
|
res = sortedKeyCompare(pCsr->pDb->xCmp, iTopic, pKey, nKey,
|
|
pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
|
|
);
|
|
}
|
|
|
|
/* If (res<0), then key pKey/nKey is smaller than the split-key (or this
|
|
** is not a composite level and there is no split-key). Search the
|
|
** left-hand-side of the level in this case. */
|
|
if( res<0 ){
|
|
int i;
|
|
LsmPgno iPtr = 0;
|
|
if( nRhs==0 ) iPtr = *piPgno;
|
|
|
|
rc = seekInSegment(
|
|
pCsr, &aPtr[0], iTopic, pKey, nKey, iPtr, eSeek, &iOut, &bStop
|
|
);
|
|
if( rc==LSM_OK && nRhs>0 && eSeek==LSM_SEEK_GE && aPtr[0].pPg==0 ){
|
|
res = 0;
|
|
}
|
|
for(i=1; i<=nRhs; i++){
|
|
segmentPtrReset(&aPtr[i], LSM_SEGMENTPTR_FREE_THRESHOLD);
|
|
}
|
|
}
|
|
|
|
if( res>=0 ){
|
|
int bHit = 0; /* True if at least one rhs is not EOF */
|
|
LsmPgno iPtr = *piPgno;
|
|
int i;
|
|
segmentPtrReset(&aPtr[0], LSM_SEGMENTPTR_FREE_THRESHOLD);
|
|
for(i=1; rc==LSM_OK && i<=nRhs && bStop==0; i++){
|
|
SegmentPtr *pPtr = &aPtr[i];
|
|
iOut = 0;
|
|
rc = seekInSegment(
|
|
pCsr, pPtr, iTopic, pKey, nKey, iPtr, eSeek, &iOut, &bStop
|
|
);
|
|
iPtr = iOut;
|
|
|
|
/* If the segment-pointer has settled on a key that is smaller than
|
|
** the splitkey, invalidate the segment-pointer. */
|
|
if( pPtr->pPg ){
|
|
res = sortedKeyCompare(pCsr->pDb->xCmp,
|
|
rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey,
|
|
pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
|
|
);
|
|
if( res<0 ){
|
|
if( pPtr->eType & LSM_START_DELETE ){
|
|
pPtr->eType &= ~LSM_INSERT;
|
|
pPtr->pKey = pLvl->pSplitKey;
|
|
pPtr->nKey = pLvl->nSplitKey;
|
|
pPtr->pVal = 0;
|
|
pPtr->nVal = 0;
|
|
}else{
|
|
segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
|
|
}
|
|
}
|
|
}
|
|
|
|
if( aPtr[i].pKey ) bHit = 1;
|
|
}
|
|
|
|
if( rc==LSM_OK && eSeek==LSM_SEEK_LE && bHit==0 ){
|
|
rc = segmentPtrEnd(pCsr, &aPtr[0], 1);
|
|
}
|
|
}
|
|
|
|
assert( eSeek==LSM_SEEK_EQ || bStop==0 );
|
|
*piPgno = iOut;
|
|
*pbStop = bStop;
|
|
return rc;
|
|
}
|
|
|
|
static void multiCursorGetKey(
|
|
MultiCursor *pCsr,
|
|
int iKey,
|
|
int *peType, /* OUT: Key type (SORTED_WRITE etc.) */
|
|
void **ppKey, /* OUT: Pointer to buffer containing key */
|
|
int *pnKey /* OUT: Size of *ppKey in bytes */
|
|
){
|
|
int nKey = 0;
|
|
void *pKey = 0;
|
|
int eType = 0;
|
|
|
|
switch( iKey ){
|
|
case CURSOR_DATA_TREE0:
|
|
case CURSOR_DATA_TREE1: {
|
|
TreeCursor *pTreeCsr = pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0];
|
|
if( lsmTreeCursorValid(pTreeCsr) ){
|
|
lsmTreeCursorKey(pTreeCsr, &eType, &pKey, &nKey);
|
|
}
|
|
break;
|
|
}
|
|
|
|
case CURSOR_DATA_SYSTEM: {
|
|
Snapshot *pWorker = pCsr->pDb->pWorker;
|
|
if( pWorker && (pCsr->flags & CURSOR_FLUSH_FREELIST) ){
|
|
int nEntry = pWorker->freelist.nEntry;
|
|
if( pCsr->iFree < (nEntry*2) ){
|
|
FreelistEntry *aEntry = pWorker->freelist.aEntry;
|
|
int i = nEntry - 1 - (pCsr->iFree / 2);
|
|
u32 iKey2 = 0;
|
|
|
|
if( (pCsr->iFree % 2) ){
|
|
eType = LSM_END_DELETE|LSM_SYSTEMKEY;
|
|
iKey2 = aEntry[i].iBlk-1;
|
|
}else if( aEntry[i].iId>=0 ){
|
|
eType = LSM_INSERT|LSM_SYSTEMKEY;
|
|
iKey2 = aEntry[i].iBlk;
|
|
|
|
/* If the in-memory entry immediately before this one was a
|
|
** DELETE, and the block number is one greater than the current
|
|
** block number, mark this entry as an "end-delete-range". */
|
|
if( i<(nEntry-1) && aEntry[i+1].iBlk==iKey2+1 && aEntry[i+1].iId<0 ){
|
|
eType |= LSM_END_DELETE;
|
|
}
|
|
|
|
}else{
|
|
eType = LSM_START_DELETE|LSM_SYSTEMKEY;
|
|
iKey2 = aEntry[i].iBlk + 1;
|
|
}
|
|
|
|
/* If the in-memory entry immediately after this one is a
|
|
** DELETE, and the block number is one less than the current
|
|
** key, mark this entry as an "start-delete-range". */
|
|
if( i>0 && aEntry[i-1].iBlk==iKey2-1 && aEntry[i-1].iId<0 ){
|
|
eType |= LSM_START_DELETE;
|
|
}
|
|
|
|
pKey = pCsr->pSystemVal;
|
|
nKey = 4;
|
|
lsmPutU32(pKey, ~iKey2);
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
default: {
|
|
int iPtr = iKey - CURSOR_DATA_SEGMENT;
|
|
assert( iPtr>=0 );
|
|
if( iPtr==pCsr->nPtr ){
|
|
if( pCsr->pBtCsr ){
|
|
pKey = pCsr->pBtCsr->pKey;
|
|
nKey = pCsr->pBtCsr->nKey;
|
|
eType = pCsr->pBtCsr->eType;
|
|
}
|
|
}else if( iPtr<pCsr->nPtr ){
|
|
SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
|
|
if( pPtr->pPg ){
|
|
pKey = pPtr->pKey;
|
|
nKey = pPtr->nKey;
|
|
eType = pPtr->eType;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
if( peType ) *peType = eType;
|
|
if( pnKey ) *pnKey = nKey;
|
|
if( ppKey ) *ppKey = pKey;
|
|
}
|
|
|
|
static int sortedDbKeyCompare(
|
|
MultiCursor *pCsr,
|
|
int iLhsFlags, void *pLhsKey, int nLhsKey,
|
|
int iRhsFlags, void *pRhsKey, int nRhsKey
|
|
){
|
|
int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
|
|
int res;
|
|
|
|
/* Compare the keys, including the system flag. */
|
|
res = sortedKeyCompare(xCmp,
|
|
rtTopic(iLhsFlags), pLhsKey, nLhsKey,
|
|
rtTopic(iRhsFlags), pRhsKey, nRhsKey
|
|
);
|
|
|
|
/* If a key has the LSM_START_DELETE flag set, but not the LSM_INSERT or
|
|
** LSM_POINT_DELETE flags, it is considered a delta larger. This prevents
|
|
** the beginning of an open-ended set from masking a database entry or
|
|
** delete at a lower level. */
|
|
if( res==0 && (pCsr->flags & CURSOR_IGNORE_DELETE) ){
|
|
const int m = LSM_POINT_DELETE|LSM_INSERT|LSM_END_DELETE |LSM_START_DELETE;
|
|
int iDel1 = 0;
|
|
int iDel2 = 0;
|
|
|
|
if( LSM_START_DELETE==(iLhsFlags & m) ) iDel1 = +1;
|
|
if( LSM_END_DELETE ==(iLhsFlags & m) ) iDel1 = -1;
|
|
if( LSM_START_DELETE==(iRhsFlags & m) ) iDel2 = +1;
|
|
if( LSM_END_DELETE ==(iRhsFlags & m) ) iDel2 = -1;
|
|
|
|
res = (iDel1 - iDel2);
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
static void multiCursorDoCompare(MultiCursor *pCsr, int iOut, int bReverse){
|
|
int i1;
|
|
int i2;
|
|
int iRes;
|
|
void *pKey1; int nKey1; int eType1;
|
|
void *pKey2; int nKey2; int eType2;
|
|
const int mul = (bReverse ? -1 : 1);
|
|
|
|
assert( pCsr->aTree && iOut<pCsr->nTree );
|
|
if( iOut>=(pCsr->nTree/2) ){
|
|
i1 = (iOut - pCsr->nTree/2) * 2;
|
|
i2 = i1 + 1;
|
|
}else{
|
|
i1 = pCsr->aTree[iOut*2];
|
|
i2 = pCsr->aTree[iOut*2+1];
|
|
}
|
|
|
|
multiCursorGetKey(pCsr, i1, &eType1, &pKey1, &nKey1);
|
|
multiCursorGetKey(pCsr, i2, &eType2, &pKey2, &nKey2);
|
|
|
|
if( pKey1==0 ){
|
|
iRes = i2;
|
|
}else if( pKey2==0 ){
|
|
iRes = i1;
|
|
}else{
|
|
int res;
|
|
|
|
/* Compare the keys */
|
|
res = sortedDbKeyCompare(pCsr,
|
|
eType1, pKey1, nKey1, eType2, pKey2, nKey2
|
|
);
|
|
|
|
res = res * mul;
|
|
if( res==0 ){
|
|
/* The two keys are identical. Normally, this means that the key from
|
|
** the newer run clobbers the old. However, if the newer key is a
|
|
** separator key, or a range-delete-boundary only, do not allow it
|
|
** to clobber an older entry. */
|
|
int nc1 = (eType1 & (LSM_INSERT|LSM_POINT_DELETE))==0;
|
|
int nc2 = (eType2 & (LSM_INSERT|LSM_POINT_DELETE))==0;
|
|
iRes = (nc1 > nc2) ? i2 : i1;
|
|
}else if( res<0 ){
|
|
iRes = i1;
|
|
}else{
|
|
iRes = i2;
|
|
}
|
|
}
|
|
|
|
pCsr->aTree[iOut] = iRes;
|
|
}
|
|
|
|
/*
|
|
** This function advances segment pointer iPtr belonging to multi-cursor
|
|
** pCsr forward (bReverse==0) or backward (bReverse!=0).
|
|
**
|
|
** If the segment pointer points to a segment that is part of a composite
|
|
** level, then the following special case is handled.
|
|
**
|
|
** * If iPtr is the lhs of a composite level, and the cursor is being
|
|
** advanced forwards, and segment iPtr is at EOF, move all pointers
|
|
** that correspond to rhs segments of the same level to the first
|
|
** key in their respective data.
|
|
*/
|
|
static int segmentCursorAdvance(
|
|
MultiCursor *pCsr,
|
|
int iPtr,
|
|
int bReverse
|
|
){
|
|
int rc;
|
|
SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
|
|
Level *pLvl = pPtr->pLevel;
|
|
int bComposite; /* True if pPtr is part of composite level */
|
|
|
|
/* Advance the segment-pointer object. */
|
|
rc = segmentPtrAdvance(pCsr, pPtr, bReverse);
|
|
if( rc!=LSM_OK ) return rc;
|
|
|
|
bComposite = (pLvl->nRight>0 && pCsr->nPtr>pLvl->nRight);
|
|
if( bComposite && pPtr->pPg==0 ){
|
|
int bFix = 0;
|
|
if( (bReverse==0)==(pPtr->pSeg==&pLvl->lhs) ){
|
|
int i;
|
|
if( bReverse ){
|
|
SegmentPtr *pLhs = &pCsr->aPtr[iPtr - 1 - (pPtr->pSeg - pLvl->aRhs)];
|
|
for(i=0; i<pLvl->nRight; i++){
|
|
if( pLhs[i+1].pPg ) break;
|
|
}
|
|
if( i==pLvl->nRight ){
|
|
bFix = 1;
|
|
rc = segmentPtrEnd(pCsr, pLhs, 1);
|
|
}
|
|
}else{
|
|
bFix = 1;
|
|
for(i=0; rc==LSM_OK && i<pLvl->nRight; i++){
|
|
rc = sortedRhsFirst(pCsr, pLvl, &pCsr->aPtr[iPtr+1+i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
if( bFix ){
|
|
int i;
|
|
for(i=pCsr->nTree-1; i>0; i--){
|
|
multiCursorDoCompare(pCsr, i, bReverse);
|
|
}
|
|
}
|
|
}
|
|
|
|
#if 0
|
|
if( bComposite && pPtr->pSeg==&pLvl->lhs /* lhs of composite level */
|
|
&& bReverse==0 /* csr advanced forwards */
|
|
&& pPtr->pPg==0 /* segment at EOF */
|
|
){
|
|
int i;
|
|
for(i=0; rc==LSM_OK && i<pLvl->nRight; i++){
|
|
rc = sortedRhsFirst(pCsr, pLvl, &pCsr->aPtr[iPtr+1+i]);
|
|
}
|
|
for(i=pCsr->nTree-1; i>0; i--){
|
|
multiCursorDoCompare(pCsr, i, 0);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
return rc;
|
|
}
|
|
|
|
static void mcursorFreeComponents(MultiCursor *pCsr){
|
|
int i;
|
|
lsm_env *pEnv = pCsr->pDb->pEnv;
|
|
|
|
/* Close the tree cursor, if any. */
|
|
lsmTreeCursorDestroy(pCsr->apTreeCsr[0]);
|
|
lsmTreeCursorDestroy(pCsr->apTreeCsr[1]);
|
|
|
|
/* Reset the segment pointers */
|
|
for(i=0; i<pCsr->nPtr; i++){
|
|
segmentPtrReset(&pCsr->aPtr[i], 0);
|
|
}
|
|
|
|
/* And the b-tree cursor, if any */
|
|
btreeCursorFree(pCsr->pBtCsr);
|
|
|
|
/* Free allocations */
|
|
lsmFree(pEnv, pCsr->aPtr);
|
|
lsmFree(pEnv, pCsr->aTree);
|
|
lsmFree(pEnv, pCsr->pSystemVal);
|
|
|
|
/* Zero fields */
|
|
pCsr->nPtr = 0;
|
|
pCsr->aPtr = 0;
|
|
pCsr->nTree = 0;
|
|
pCsr->aTree = 0;
|
|
pCsr->pSystemVal = 0;
|
|
pCsr->apTreeCsr[0] = 0;
|
|
pCsr->apTreeCsr[1] = 0;
|
|
pCsr->pBtCsr = 0;
|
|
}
|
|
|
|
void lsmMCursorFreeCache(lsm_db *pDb){
|
|
MultiCursor *p;
|
|
MultiCursor *pNext;
|
|
for(p=pDb->pCsrCache; p; p=pNext){
|
|
pNext = p->pNext;
|
|
lsmMCursorClose(p, 0);
|
|
}
|
|
pDb->pCsrCache = 0;
|
|
}
|
|
|
|
/*
|
|
** Close the cursor passed as the first argument.
|
|
**
|
|
** If the bCache parameter is true, then shift the cursor to the pCsrCache
|
|
** list for possible reuse instead of actually deleting it.
|
|
*/
|
|
void lsmMCursorClose(MultiCursor *pCsr, int bCache){
|
|
if( pCsr ){
|
|
lsm_db *pDb = pCsr->pDb;
|
|
MultiCursor **pp; /* Iterator variable */
|
|
|
|
/* The cursor may or may not be currently part of the linked list
|
|
** starting at lsm_db.pCsr. If it is, extract it. */
|
|
for(pp=&pDb->pCsr; *pp; pp=&((*pp)->pNext)){
|
|
if( *pp==pCsr ){
|
|
*pp = pCsr->pNext;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if( bCache ){
|
|
int i; /* Used to iterate through segment-pointers */
|
|
|
|
/* Release any page references held by this cursor. */
|
|
assert( !pCsr->pBtCsr );
|
|
for(i=0; i<pCsr->nPtr; i++){
|
|
SegmentPtr *pPtr = &pCsr->aPtr[i];
|
|
lsmFsPageRelease(pPtr->pPg);
|
|
pPtr->pPg = 0;
|
|
}
|
|
|
|
/* Reset the tree cursors */
|
|
lsmTreeCursorReset(pCsr->apTreeCsr[0]);
|
|
lsmTreeCursorReset(pCsr->apTreeCsr[1]);
|
|
|
|
/* Add the cursor to the pCsrCache list */
|
|
pCsr->pNext = pDb->pCsrCache;
|
|
pDb->pCsrCache = pCsr;
|
|
}else{
|
|
/* Free the allocation used to cache the current key, if any. */
|
|
sortedBlobFree(&pCsr->key);
|
|
sortedBlobFree(&pCsr->val);
|
|
|
|
/* Free the component cursors */
|
|
mcursorFreeComponents(pCsr);
|
|
|
|
/* Free the cursor structure itself */
|
|
lsmFree(pDb->pEnv, pCsr);
|
|
}
|
|
}
|
|
}
|
|
|
|
#define TREE_NONE 0
|
|
#define TREE_OLD 1
|
|
#define TREE_BOTH 2
|
|
|
|
/*
|
|
** Parameter eTree is one of TREE_OLD or TREE_BOTH.
|
|
*/
|
|
static int multiCursorAddTree(MultiCursor *pCsr, Snapshot *pSnap, int eTree){
|
|
int rc = LSM_OK;
|
|
lsm_db *db = pCsr->pDb;
|
|
|
|
/* Add a tree cursor on the 'old' tree, if it exists. */
|
|
if( eTree!=TREE_NONE
|
|
&& lsmTreeHasOld(db)
|
|
&& db->treehdr.iOldLog!=pSnap->iLogOff
|
|
){
|
|
rc = lsmTreeCursorNew(db, 1, &pCsr->apTreeCsr[1]);
|
|
}
|
|
|
|
/* Add a tree cursor on the 'current' tree, if required. */
|
|
if( rc==LSM_OK && eTree==TREE_BOTH ){
|
|
rc = lsmTreeCursorNew(db, 0, &pCsr->apTreeCsr[0]);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int multiCursorAddRhs(MultiCursor *pCsr, Level *pLvl){
|
|
int i;
|
|
int nRhs = pLvl->nRight;
|
|
|
|
assert( pLvl->nRight>0 );
|
|
assert( pCsr->aPtr==0 );
|
|
pCsr->aPtr = lsmMallocZero(pCsr->pDb->pEnv, sizeof(SegmentPtr) * nRhs);
|
|
if( !pCsr->aPtr ) return LSM_NOMEM_BKPT;
|
|
pCsr->nPtr = nRhs;
|
|
|
|
for(i=0; i<nRhs; i++){
|
|
pCsr->aPtr[i].pSeg = &pLvl->aRhs[i];
|
|
pCsr->aPtr[i].pLevel = pLvl;
|
|
}
|
|
|
|
return LSM_OK;
|
|
}
|
|
|
|
static void multiCursorAddOne(MultiCursor *pCsr, Level *pLvl, int *pRc){
|
|
if( *pRc==LSM_OK ){
|
|
int iPtr = pCsr->nPtr;
|
|
int i;
|
|
pCsr->aPtr[iPtr].pLevel = pLvl;
|
|
pCsr->aPtr[iPtr].pSeg = &pLvl->lhs;
|
|
iPtr++;
|
|
for(i=0; i<pLvl->nRight; i++){
|
|
pCsr->aPtr[iPtr].pLevel = pLvl;
|
|
pCsr->aPtr[iPtr].pSeg = &pLvl->aRhs[i];
|
|
iPtr++;
|
|
}
|
|
|
|
if( pLvl->nRight && pLvl->pSplitKey==0 ){
|
|
sortedSplitkey(pCsr->pDb, pLvl, pRc);
|
|
}
|
|
pCsr->nPtr = iPtr;
|
|
}
|
|
}
|
|
|
|
static int multiCursorAddAll(MultiCursor *pCsr, Snapshot *pSnap){
|
|
Level *pLvl;
|
|
int nPtr = 0;
|
|
int rc = LSM_OK;
|
|
|
|
for(pLvl=pSnap->pLevel; pLvl; pLvl=pLvl->pNext){
|
|
/* If the LEVEL_INCOMPLETE flag is set, then this function is being
|
|
** called (indirectly) from within a sortedNewToplevel() call to
|
|
** construct pLvl. In this case ignore pLvl - this cursor is going to
|
|
** be used to retrieve a freelist entry from the LSM, and the partially
|
|
** complete level may confuse it. */
|
|
if( pLvl->flags & LEVEL_INCOMPLETE ) continue;
|
|
nPtr += (1 + pLvl->nRight);
|
|
}
|
|
|
|
assert( pCsr->aPtr==0 );
|
|
pCsr->aPtr = lsmMallocZeroRc(pCsr->pDb->pEnv, sizeof(SegmentPtr) * nPtr, &rc);
|
|
|
|
for(pLvl=pSnap->pLevel; pLvl; pLvl=pLvl->pNext){
|
|
if( (pLvl->flags & LEVEL_INCOMPLETE)==0 ){
|
|
multiCursorAddOne(pCsr, pLvl, &rc);
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int multiCursorInit(MultiCursor *pCsr, Snapshot *pSnap){
|
|
int rc;
|
|
rc = multiCursorAddAll(pCsr, pSnap);
|
|
if( rc==LSM_OK ){
|
|
rc = multiCursorAddTree(pCsr, pSnap, TREE_BOTH);
|
|
}
|
|
pCsr->flags |= (CURSOR_IGNORE_SYSTEM | CURSOR_IGNORE_DELETE);
|
|
return rc;
|
|
}
|
|
|
|
static MultiCursor *multiCursorNew(lsm_db *db, int *pRc){
|
|
MultiCursor *pCsr;
|
|
pCsr = (MultiCursor *)lsmMallocZeroRc(db->pEnv, sizeof(MultiCursor), pRc);
|
|
if( pCsr ){
|
|
pCsr->pNext = db->pCsr;
|
|
db->pCsr = pCsr;
|
|
pCsr->pDb = db;
|
|
}
|
|
return pCsr;
|
|
}
|
|
|
|
|
|
void lsmSortedRemap(lsm_db *pDb){
|
|
MultiCursor *pCsr;
|
|
for(pCsr=pDb->pCsr; pCsr; pCsr=pCsr->pNext){
|
|
int iPtr;
|
|
if( pCsr->pBtCsr ){
|
|
btreeCursorLoadKey(pCsr->pBtCsr);
|
|
}
|
|
for(iPtr=0; iPtr<pCsr->nPtr; iPtr++){
|
|
segmentPtrLoadCell(&pCsr->aPtr[iPtr], pCsr->aPtr[iPtr].iCell);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void multiCursorReadSeparators(MultiCursor *pCsr){
|
|
if( pCsr->nPtr>0 ){
|
|
pCsr->flags |= CURSOR_READ_SEPARATORS;
|
|
}
|
|
}
|
|
|
|
/*
|
|
** Have this cursor skip over SORTED_DELETE entries.
|
|
*/
|
|
static void multiCursorIgnoreDelete(MultiCursor *pCsr){
|
|
if( pCsr ) pCsr->flags |= CURSOR_IGNORE_DELETE;
|
|
}
|
|
|
|
/*
|
|
** If the free-block list is not empty, then have this cursor visit a key
|
|
** with (a) the system bit set, and (b) the key "FREELIST" and (c) a value
|
|
** blob containing the serialized free-block list.
|
|
*/
|
|
static int multiCursorVisitFreelist(MultiCursor *pCsr){
|
|
int rc = LSM_OK;
|
|
pCsr->flags |= CURSOR_FLUSH_FREELIST;
|
|
pCsr->pSystemVal = lsmMallocRc(pCsr->pDb->pEnv, 4 + 8, &rc);
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Allocate and return a new database cursor.
|
|
**
|
|
** This method should only be called to allocate user cursors. As it may
|
|
** recycle a cursor from lsm_db.pCsrCache.
|
|
*/
|
|
int lsmMCursorNew(
|
|
lsm_db *pDb, /* Database handle */
|
|
MultiCursor **ppCsr /* OUT: Allocated cursor */
|
|
){
|
|
MultiCursor *pCsr = 0;
|
|
int rc = LSM_OK;
|
|
|
|
if( pDb->pCsrCache ){
|
|
int bOld; /* True if there is an old in-memory tree */
|
|
|
|
/* Remove a cursor from the pCsrCache list and add it to the open list. */
|
|
pCsr = pDb->pCsrCache;
|
|
pDb->pCsrCache = pCsr->pNext;
|
|
pCsr->pNext = pDb->pCsr;
|
|
pDb->pCsr = pCsr;
|
|
|
|
/* The cursor can almost be used as is, except that the old in-memory
|
|
** tree cursor may be present and not required, or required and not
|
|
** present. Fix this if required. */
|
|
bOld = (lsmTreeHasOld(pDb) && pDb->treehdr.iOldLog!=pDb->pClient->iLogOff);
|
|
if( !bOld && pCsr->apTreeCsr[1] ){
|
|
lsmTreeCursorDestroy(pCsr->apTreeCsr[1]);
|
|
pCsr->apTreeCsr[1] = 0;
|
|
}else if( bOld && !pCsr->apTreeCsr[1] ){
|
|
rc = lsmTreeCursorNew(pDb, 1, &pCsr->apTreeCsr[1]);
|
|
}
|
|
|
|
pCsr->flags = (CURSOR_IGNORE_SYSTEM | CURSOR_IGNORE_DELETE);
|
|
|
|
}else{
|
|
pCsr = multiCursorNew(pDb, &rc);
|
|
if( rc==LSM_OK ) rc = multiCursorInit(pCsr, pDb->pClient);
|
|
}
|
|
|
|
if( rc!=LSM_OK ){
|
|
lsmMCursorClose(pCsr, 0);
|
|
pCsr = 0;
|
|
}
|
|
assert( (rc==LSM_OK)==(pCsr!=0) );
|
|
*ppCsr = pCsr;
|
|
return rc;
|
|
}
|
|
|
|
static int multiCursorGetVal(
|
|
MultiCursor *pCsr,
|
|
int iVal,
|
|
void **ppVal,
|
|
int *pnVal
|
|
){
|
|
int rc = LSM_OK;
|
|
|
|
*ppVal = 0;
|
|
*pnVal = 0;
|
|
|
|
switch( iVal ){
|
|
case CURSOR_DATA_TREE0:
|
|
case CURSOR_DATA_TREE1: {
|
|
TreeCursor *pTreeCsr = pCsr->apTreeCsr[iVal-CURSOR_DATA_TREE0];
|
|
if( lsmTreeCursorValid(pTreeCsr) ){
|
|
lsmTreeCursorValue(pTreeCsr, ppVal, pnVal);
|
|
}else{
|
|
*ppVal = 0;
|
|
*pnVal = 0;
|
|
}
|
|
break;
|
|
}
|
|
|
|
case CURSOR_DATA_SYSTEM: {
|
|
Snapshot *pWorker = pCsr->pDb->pWorker;
|
|
if( pWorker
|
|
&& (pCsr->iFree % 2)==0
|
|
&& pCsr->iFree < (pWorker->freelist.nEntry*2)
|
|
){
|
|
int iEntry = pWorker->freelist.nEntry - 1 - (pCsr->iFree / 2);
|
|
u8 *aVal = &((u8 *)(pCsr->pSystemVal))[4];
|
|
lsmPutU64(aVal, pWorker->freelist.aEntry[iEntry].iId);
|
|
*ppVal = aVal;
|
|
*pnVal = 8;
|
|
}
|
|
break;
|
|
}
|
|
|
|
default: {
|
|
int iPtr = iVal-CURSOR_DATA_SEGMENT;
|
|
if( iPtr<pCsr->nPtr ){
|
|
SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
|
|
if( pPtr->pPg ){
|
|
*ppVal = pPtr->pVal;
|
|
*pnVal = pPtr->nVal;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
assert( rc==LSM_OK || (*ppVal==0 && *pnVal==0) );
|
|
return rc;
|
|
}
|
|
|
|
static int multiCursorAdvance(MultiCursor *pCsr, int bReverse);
|
|
|
|
/*
|
|
** This function is called by worker connections to walk the part of the
|
|
** free-list stored within the LSM data structure.
|
|
*/
|
|
int lsmSortedWalkFreelist(
|
|
lsm_db *pDb, /* Database handle */
|
|
int bReverse, /* True to iterate from largest to smallest */
|
|
int (*x)(void *, int, i64), /* Callback function */
|
|
void *pCtx /* First argument to pass to callback */
|
|
){
|
|
MultiCursor *pCsr; /* Cursor used to read db */
|
|
int rc = LSM_OK; /* Return Code */
|
|
Snapshot *pSnap = 0;
|
|
|
|
assert( pDb->pWorker );
|
|
if( pDb->bIncrMerge ){
|
|
rc = lsmCheckpointDeserialize(pDb, 0, pDb->pShmhdr->aSnap1, &pSnap);
|
|
if( rc!=LSM_OK ) return rc;
|
|
}else{
|
|
pSnap = pDb->pWorker;
|
|
}
|
|
|
|
pCsr = multiCursorNew(pDb, &rc);
|
|
if( pCsr ){
|
|
rc = multiCursorAddAll(pCsr, pSnap);
|
|
pCsr->flags |= CURSOR_IGNORE_DELETE;
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
if( bReverse==0 ){
|
|
rc = lsmMCursorLast(pCsr);
|
|
}else{
|
|
rc = lsmMCursorSeek(pCsr, 1, "", 0, LSM_SEEK_GE);
|
|
}
|
|
|
|
while( rc==LSM_OK && lsmMCursorValid(pCsr) && rtIsSystem(pCsr->eType) ){
|
|
void *pKey; int nKey;
|
|
void *pVal = 0; int nVal = 0;
|
|
|
|
rc = lsmMCursorKey(pCsr, &pKey, &nKey);
|
|
if( rc==LSM_OK ) rc = lsmMCursorValue(pCsr, &pVal, &nVal);
|
|
if( rc==LSM_OK && (nKey!=4 || nVal!=8) ) rc = LSM_CORRUPT_BKPT;
|
|
|
|
if( rc==LSM_OK ){
|
|
int iBlk;
|
|
i64 iSnap;
|
|
iBlk = (int)(~(lsmGetU32((u8 *)pKey)));
|
|
iSnap = (i64)lsmGetU64((u8 *)pVal);
|
|
if( x(pCtx, iBlk, iSnap) ) break;
|
|
rc = multiCursorAdvance(pCsr, !bReverse);
|
|
}
|
|
}
|
|
}
|
|
|
|
lsmMCursorClose(pCsr, 0);
|
|
if( pSnap!=pDb->pWorker ){
|
|
lsmFreeSnapshot(pDb->pEnv, pSnap);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
int lsmSortedLoadFreelist(
|
|
lsm_db *pDb, /* Database handle (must be worker) */
|
|
void **ppVal, /* OUT: Blob containing LSM free-list */
|
|
int *pnVal /* OUT: Size of *ppVal blob in bytes */
|
|
){
|
|
MultiCursor *pCsr; /* Cursor used to retreive free-list */
|
|
int rc = LSM_OK; /* Return Code */
|
|
|
|
assert( pDb->pWorker );
|
|
assert( *ppVal==0 && *pnVal==0 );
|
|
|
|
pCsr = multiCursorNew(pDb, &rc);
|
|
if( pCsr ){
|
|
rc = multiCursorAddAll(pCsr, pDb->pWorker);
|
|
pCsr->flags |= CURSOR_IGNORE_DELETE;
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
rc = lsmMCursorLast(pCsr);
|
|
if( rc==LSM_OK
|
|
&& rtIsWrite(pCsr->eType) && rtIsSystem(pCsr->eType)
|
|
&& pCsr->key.nData==8
|
|
&& 0==memcmp(pCsr->key.pData, "FREELIST", 8)
|
|
){
|
|
void *pVal; int nVal; /* Value read from database */
|
|
rc = lsmMCursorValue(pCsr, &pVal, &nVal);
|
|
if( rc==LSM_OK ){
|
|
*ppVal = lsmMallocRc(pDb->pEnv, nVal, &rc);
|
|
if( *ppVal ){
|
|
memcpy(*ppVal, pVal, nVal);
|
|
*pnVal = nVal;
|
|
}
|
|
}
|
|
}
|
|
|
|
lsmMCursorClose(pCsr, 0);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int multiCursorAllocTree(MultiCursor *pCsr){
|
|
int rc = LSM_OK;
|
|
if( pCsr->aTree==0 ){
|
|
int nByte; /* Bytes of space to allocate */
|
|
int nMin; /* Total number of cursors being merged */
|
|
|
|
nMin = CURSOR_DATA_SEGMENT + pCsr->nPtr + (pCsr->pBtCsr!=0);
|
|
pCsr->nTree = 2;
|
|
while( pCsr->nTree<nMin ){
|
|
pCsr->nTree = pCsr->nTree*2;
|
|
}
|
|
|
|
nByte = sizeof(int)*pCsr->nTree*2;
|
|
pCsr->aTree = (int *)lsmMallocZeroRc(pCsr->pDb->pEnv, nByte, &rc);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
static void multiCursorCacheKey(MultiCursor *pCsr, int *pRc){
|
|
if( *pRc==LSM_OK ){
|
|
void *pKey;
|
|
int nKey;
|
|
multiCursorGetKey(pCsr, pCsr->aTree[1], &pCsr->eType, &pKey, &nKey);
|
|
*pRc = sortedBlobSet(pCsr->pDb->pEnv, &pCsr->key, pKey, nKey);
|
|
}
|
|
}
|
|
|
|
#ifdef LSM_DEBUG_EXPENSIVE
|
|
static void assertCursorTree(MultiCursor *pCsr){
|
|
int bRev = !!(pCsr->flags & CURSOR_PREV_OK);
|
|
int *aSave = pCsr->aTree;
|
|
int nSave = pCsr->nTree;
|
|
int rc;
|
|
|
|
pCsr->aTree = 0;
|
|
pCsr->nTree = 0;
|
|
rc = multiCursorAllocTree(pCsr);
|
|
if( rc==LSM_OK ){
|
|
int i;
|
|
for(i=pCsr->nTree-1; i>0; i--){
|
|
multiCursorDoCompare(pCsr, i, bRev);
|
|
}
|
|
|
|
assert( nSave==pCsr->nTree
|
|
&& 0==memcmp(aSave, pCsr->aTree, sizeof(int)*nSave)
|
|
);
|
|
|
|
lsmFree(pCsr->pDb->pEnv, pCsr->aTree);
|
|
}
|
|
|
|
pCsr->aTree = aSave;
|
|
pCsr->nTree = nSave;
|
|
}
|
|
#else
|
|
# define assertCursorTree(x)
|
|
#endif
|
|
|
|
static int mcursorLocationOk(MultiCursor *pCsr, int bDeleteOk){
|
|
int eType = pCsr->eType;
|
|
int iKey;
|
|
int i;
|
|
int rdmask;
|
|
|
|
assert( pCsr->flags & (CURSOR_NEXT_OK|CURSOR_PREV_OK) );
|
|
assertCursorTree(pCsr);
|
|
|
|
rdmask = (pCsr->flags & CURSOR_NEXT_OK) ? LSM_END_DELETE : LSM_START_DELETE;
|
|
|
|
/* If the cursor does not currently point to an actual database key (i.e.
|
|
** it points to a delete key, or the start or end of a range-delete), and
|
|
** the CURSOR_IGNORE_DELETE flag is set, skip past this entry. */
|
|
if( (pCsr->flags & CURSOR_IGNORE_DELETE) && bDeleteOk==0 ){
|
|
if( (eType & LSM_INSERT)==0 ) return 0;
|
|
}
|
|
|
|
/* If the cursor points to a system key (free-list entry), and the
|
|
** CURSOR_IGNORE_SYSTEM flag is set, skip thie entry. */
|
|
if( (pCsr->flags & CURSOR_IGNORE_SYSTEM) && rtTopic(eType)!=0 ){
|
|
return 0;
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
/* This block fires assert() statements to check one of the assumptions
|
|
** in the comment below - that if the lhs sub-cursor of a level undergoing
|
|
** a merge is valid, then all the rhs sub-cursors must be at EOF.
|
|
**
|
|
** Also assert that all rhs sub-cursors are either at EOF or point to
|
|
** a key that is not less than the level split-key. */
|
|
for(i=0; i<pCsr->nPtr; i++){
|
|
SegmentPtr *pPtr = &pCsr->aPtr[i];
|
|
Level *pLvl = pPtr->pLevel;
|
|
if( pLvl->nRight && pPtr->pPg ){
|
|
if( pPtr->pSeg==&pLvl->lhs ){
|
|
int j;
|
|
for(j=0; j<pLvl->nRight; j++) assert( pPtr[j+1].pPg==0 );
|
|
}else{
|
|
int res = sortedKeyCompare(pCsr->pDb->xCmp,
|
|
rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey,
|
|
pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
|
|
);
|
|
assert( res>=0 );
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
/* Now check if this key has already been deleted by a range-delete. If
|
|
** so, skip past it.
|
|
**
|
|
** Assume, for the moment, that the tree contains no levels currently
|
|
** undergoing incremental merge, and that this cursor is iterating forwards
|
|
** through the database keys. The cursor currently points to a key in
|
|
** level L. This key has already been deleted if any of the sub-cursors
|
|
** that point to levels newer than L (or to the in-memory tree) point to
|
|
** a key greater than the current key with the LSM_END_DELETE flag set.
|
|
**
|
|
** Or, if the cursor is iterating backwards through data keys, if any
|
|
** such sub-cursor points to a key smaller than the current key with the
|
|
** LSM_START_DELETE flag set.
|
|
**
|
|
** Why it works with levels undergoing a merge too:
|
|
**
|
|
** When a cursor iterates forwards, the sub-cursors for the rhs of a
|
|
** level are only activated once the lhs reaches EOF. So when iterating
|
|
** forwards, the keys visited are the same as if the level was completely
|
|
** merged.
|
|
**
|
|
** If the cursor is iterating backwards, then the lhs sub-cursor is not
|
|
** initialized until the last of the rhs sub-cursors has reached EOF.
|
|
** Additionally, if the START_DELETE flag is set on the last entry (in
|
|
** reverse order - so the entry with the smallest key) of a rhs sub-cursor,
|
|
** then a pseudo-key equal to the levels split-key with the END_DELETE
|
|
** flag set is visited by the sub-cursor.
|
|
*/
|
|
iKey = pCsr->aTree[1];
|
|
for(i=0; i<iKey; i++){
|
|
int csrflags;
|
|
multiCursorGetKey(pCsr, i, &csrflags, 0, 0);
|
|
if( (rdmask & csrflags) ){
|
|
const int SD_ED = (LSM_START_DELETE|LSM_END_DELETE);
|
|
if( (csrflags & SD_ED)==SD_ED
|
|
|| (pCsr->flags & CURSOR_IGNORE_DELETE)==0
|
|
){
|
|
void *pKey; int nKey;
|
|
multiCursorGetKey(pCsr, i, 0, &pKey, &nKey);
|
|
if( 0==sortedKeyCompare(pCsr->pDb->xCmp,
|
|
rtTopic(eType), pCsr->key.pData, pCsr->key.nData,
|
|
rtTopic(csrflags), pKey, nKey
|
|
)){
|
|
continue;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/* The current cursor position is one this cursor should visit. Return 1. */
|
|
return 1;
|
|
}
|
|
|
|
static int multiCursorSetupTree(MultiCursor *pCsr, int bRev){
|
|
int rc;
|
|
|
|
rc = multiCursorAllocTree(pCsr);
|
|
if( rc==LSM_OK ){
|
|
int i;
|
|
for(i=pCsr->nTree-1; i>0; i--){
|
|
multiCursorDoCompare(pCsr, i, bRev);
|
|
}
|
|
}
|
|
|
|
assertCursorTree(pCsr);
|
|
multiCursorCacheKey(pCsr, &rc);
|
|
|
|
if( rc==LSM_OK && mcursorLocationOk(pCsr, 0)==0 ){
|
|
rc = multiCursorAdvance(pCsr, bRev);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
|
|
static int multiCursorEnd(MultiCursor *pCsr, int bLast){
|
|
int rc = LSM_OK;
|
|
int i;
|
|
|
|
pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK | CURSOR_SEEK_EQ);
|
|
pCsr->flags |= (bLast ? CURSOR_PREV_OK : CURSOR_NEXT_OK);
|
|
pCsr->iFree = 0;
|
|
|
|
/* Position the two in-memory tree cursors */
|
|
for(i=0; rc==LSM_OK && i<2; i++){
|
|
if( pCsr->apTreeCsr[i] ){
|
|
rc = lsmTreeCursorEnd(pCsr->apTreeCsr[i], bLast);
|
|
}
|
|
}
|
|
|
|
for(i=0; rc==LSM_OK && i<pCsr->nPtr; i++){
|
|
SegmentPtr *pPtr = &pCsr->aPtr[i];
|
|
Level *pLvl = pPtr->pLevel;
|
|
int iRhs;
|
|
int bHit = 0;
|
|
|
|
if( bLast ){
|
|
for(iRhs=0; iRhs<pLvl->nRight && rc==LSM_OK; iRhs++){
|
|
rc = segmentPtrEnd(pCsr, &pPtr[iRhs+1], 1);
|
|
if( pPtr[iRhs+1].pPg ) bHit = 1;
|
|
}
|
|
if( bHit==0 && rc==LSM_OK ){
|
|
rc = segmentPtrEnd(pCsr, pPtr, 1);
|
|
}else{
|
|
segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
|
|
}
|
|
}else{
|
|
int bLhs = (pPtr->pSeg==&pLvl->lhs);
|
|
assert( pPtr->pSeg==&pLvl->lhs || pPtr->pSeg==&pLvl->aRhs[0] );
|
|
|
|
if( bLhs ){
|
|
rc = segmentPtrEnd(pCsr, pPtr, 0);
|
|
if( pPtr->pKey ) bHit = 1;
|
|
}
|
|
for(iRhs=0; iRhs<pLvl->nRight && rc==LSM_OK; iRhs++){
|
|
if( bHit ){
|
|
segmentPtrReset(&pPtr[iRhs+1], LSM_SEGMENTPTR_FREE_THRESHOLD);
|
|
}else{
|
|
rc = sortedRhsFirst(pCsr, pLvl, &pPtr[iRhs+bLhs]);
|
|
}
|
|
}
|
|
}
|
|
i += pLvl->nRight;
|
|
}
|
|
|
|
/* And the b-tree cursor, if applicable */
|
|
if( rc==LSM_OK && pCsr->pBtCsr ){
|
|
assert( bLast==0 );
|
|
rc = btreeCursorFirst(pCsr->pBtCsr);
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
rc = multiCursorSetupTree(pCsr, bLast);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
|
|
int mcursorSave(MultiCursor *pCsr){
|
|
int rc = LSM_OK;
|
|
if( pCsr->aTree ){
|
|
int iTree = pCsr->aTree[1];
|
|
if( iTree==CURSOR_DATA_TREE0 || iTree==CURSOR_DATA_TREE1 ){
|
|
multiCursorCacheKey(pCsr, &rc);
|
|
}
|
|
}
|
|
mcursorFreeComponents(pCsr);
|
|
return rc;
|
|
}
|
|
|
|
int mcursorRestore(lsm_db *pDb, MultiCursor *pCsr){
|
|
int rc;
|
|
rc = multiCursorInit(pCsr, pDb->pClient);
|
|
if( rc==LSM_OK && pCsr->key.pData ){
|
|
rc = lsmMCursorSeek(pCsr,
|
|
rtTopic(pCsr->eType), pCsr->key.pData, pCsr->key.nData, +1
|
|
);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
int lsmSaveCursors(lsm_db *pDb){
|
|
int rc = LSM_OK;
|
|
MultiCursor *pCsr;
|
|
|
|
for(pCsr=pDb->pCsr; rc==LSM_OK && pCsr; pCsr=pCsr->pNext){
|
|
rc = mcursorSave(pCsr);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
int lsmRestoreCursors(lsm_db *pDb){
|
|
int rc = LSM_OK;
|
|
MultiCursor *pCsr;
|
|
|
|
for(pCsr=pDb->pCsr; rc==LSM_OK && pCsr; pCsr=pCsr->pNext){
|
|
rc = mcursorRestore(pDb, pCsr);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
int lsmMCursorFirst(MultiCursor *pCsr){
|
|
return multiCursorEnd(pCsr, 0);
|
|
}
|
|
|
|
int lsmMCursorLast(MultiCursor *pCsr){
|
|
return multiCursorEnd(pCsr, 1);
|
|
}
|
|
|
|
lsm_db *lsmMCursorDb(MultiCursor *pCsr){
|
|
return pCsr->pDb;
|
|
}
|
|
|
|
void lsmMCursorReset(MultiCursor *pCsr){
|
|
int i;
|
|
lsmTreeCursorReset(pCsr->apTreeCsr[0]);
|
|
lsmTreeCursorReset(pCsr->apTreeCsr[1]);
|
|
for(i=0; i<pCsr->nPtr; i++){
|
|
segmentPtrReset(&pCsr->aPtr[i], LSM_SEGMENTPTR_FREE_THRESHOLD);
|
|
}
|
|
pCsr->key.nData = 0;
|
|
}
|
|
|
|
static int treeCursorSeek(
|
|
MultiCursor *pCsr,
|
|
TreeCursor *pTreeCsr,
|
|
void *pKey, int nKey,
|
|
int eSeek,
|
|
int *pbStop
|
|
){
|
|
int rc = LSM_OK;
|
|
if( pTreeCsr ){
|
|
int res = 0;
|
|
lsmTreeCursorSeek(pTreeCsr, pKey, nKey, &res);
|
|
switch( eSeek ){
|
|
case LSM_SEEK_EQ: {
|
|
int eType = lsmTreeCursorFlags(pTreeCsr);
|
|
if( (res<0 && (eType & LSM_START_DELETE))
|
|
|| (res>0 && (eType & LSM_END_DELETE))
|
|
|| (res==0 && (eType & LSM_POINT_DELETE))
|
|
){
|
|
*pbStop = 1;
|
|
}else if( res==0 && (eType & LSM_INSERT) ){
|
|
lsm_env *pEnv = pCsr->pDb->pEnv;
|
|
void *p; int n; /* Key/value from tree-cursor */
|
|
*pbStop = 1;
|
|
pCsr->flags |= CURSOR_SEEK_EQ;
|
|
rc = lsmTreeCursorKey(pTreeCsr, &pCsr->eType, &p, &n);
|
|
if( rc==LSM_OK ) rc = sortedBlobSet(pEnv, &pCsr->key, p, n);
|
|
if( rc==LSM_OK ) rc = lsmTreeCursorValue(pTreeCsr, &p, &n);
|
|
if( rc==LSM_OK ) rc = sortedBlobSet(pEnv, &pCsr->val, p, n);
|
|
}
|
|
lsmTreeCursorReset(pTreeCsr);
|
|
break;
|
|
}
|
|
case LSM_SEEK_GE:
|
|
if( res<0 && lsmTreeCursorValid(pTreeCsr) ){
|
|
lsmTreeCursorNext(pTreeCsr);
|
|
}
|
|
break;
|
|
default:
|
|
if( res>0 ){
|
|
assert( lsmTreeCursorValid(pTreeCsr) );
|
|
lsmTreeCursorPrev(pTreeCsr);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
|
|
/*
|
|
** Seek the cursor.
|
|
*/
|
|
int lsmMCursorSeek(
|
|
MultiCursor *pCsr,
|
|
int iTopic,
|
|
void *pKey, int nKey,
|
|
int eSeek
|
|
){
|
|
int eESeek = eSeek; /* Effective eSeek parameter */
|
|
int bStop = 0; /* Set to true to halt search operation */
|
|
int rc = LSM_OK; /* Return code */
|
|
int iPtr = 0; /* Used to iterate through pCsr->aPtr[] */
|
|
LsmPgno iPgno = 0; /* FC pointer value */
|
|
|
|
assert( pCsr->apTreeCsr[0]==0 || iTopic==0 );
|
|
assert( pCsr->apTreeCsr[1]==0 || iTopic==0 );
|
|
|
|
if( eESeek==LSM_SEEK_LEFAST ) eESeek = LSM_SEEK_LE;
|
|
|
|
assert( eESeek==LSM_SEEK_EQ || eESeek==LSM_SEEK_LE || eESeek==LSM_SEEK_GE );
|
|
assert( (pCsr->flags & CURSOR_FLUSH_FREELIST)==0 );
|
|
assert( pCsr->nPtr==0 || pCsr->aPtr[0].pLevel );
|
|
|
|
pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK | CURSOR_SEEK_EQ);
|
|
rc = treeCursorSeek(pCsr, pCsr->apTreeCsr[0], pKey, nKey, eESeek, &bStop);
|
|
if( rc==LSM_OK && bStop==0 ){
|
|
rc = treeCursorSeek(pCsr, pCsr->apTreeCsr[1], pKey, nKey, eESeek, &bStop);
|
|
}
|
|
|
|
/* Seek all segment pointers. */
|
|
for(iPtr=0; iPtr<pCsr->nPtr && rc==LSM_OK && bStop==0; iPtr++){
|
|
SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
|
|
assert( pPtr->pSeg==&pPtr->pLevel->lhs );
|
|
rc = seekInLevel(pCsr, pPtr, eESeek, iTopic, pKey, nKey, &iPgno, &bStop);
|
|
iPtr += pPtr->pLevel->nRight;
|
|
}
|
|
|
|
if( eSeek!=LSM_SEEK_EQ ){
|
|
if( rc==LSM_OK ){
|
|
rc = multiCursorAllocTree(pCsr);
|
|
}
|
|
if( rc==LSM_OK ){
|
|
int i;
|
|
for(i=pCsr->nTree-1; i>0; i--){
|
|
multiCursorDoCompare(pCsr, i, eESeek==LSM_SEEK_LE);
|
|
}
|
|
if( eSeek==LSM_SEEK_GE ) pCsr->flags |= CURSOR_NEXT_OK;
|
|
if( eSeek==LSM_SEEK_LE ) pCsr->flags |= CURSOR_PREV_OK;
|
|
}
|
|
|
|
multiCursorCacheKey(pCsr, &rc);
|
|
if( rc==LSM_OK && eSeek!=LSM_SEEK_LEFAST && 0==mcursorLocationOk(pCsr, 0) ){
|
|
switch( eESeek ){
|
|
case LSM_SEEK_EQ:
|
|
lsmMCursorReset(pCsr);
|
|
break;
|
|
case LSM_SEEK_GE:
|
|
rc = lsmMCursorNext(pCsr);
|
|
break;
|
|
default:
|
|
rc = lsmMCursorPrev(pCsr);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
int lsmMCursorValid(MultiCursor *pCsr){
|
|
int res = 0;
|
|
if( pCsr->flags & CURSOR_SEEK_EQ ){
|
|
res = 1;
|
|
}else if( pCsr->aTree ){
|
|
int iKey = pCsr->aTree[1];
|
|
if( iKey==CURSOR_DATA_TREE0 || iKey==CURSOR_DATA_TREE1 ){
|
|
res = lsmTreeCursorValid(pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0]);
|
|
}else{
|
|
void *pKey;
|
|
multiCursorGetKey(pCsr, iKey, 0, &pKey, 0);
|
|
res = pKey!=0;
|
|
}
|
|
}
|
|
return res;
|
|
}
|
|
|
|
static int mcursorAdvanceOk(
|
|
MultiCursor *pCsr,
|
|
int bReverse,
|
|
int *pRc
|
|
){
|
|
void *pNew; /* Pointer to buffer containing new key */
|
|
int nNew; /* Size of buffer pNew in bytes */
|
|
int eNewType; /* Type of new record */
|
|
|
|
if( *pRc ) return 1;
|
|
|
|
/* Check the current key value. If it is not greater than (if bReverse==0)
|
|
** or less than (if bReverse!=0) the key currently cached in pCsr->key,
|
|
** then the cursor has not yet been successfully advanced.
|
|
*/
|
|
multiCursorGetKey(pCsr, pCsr->aTree[1], &eNewType, &pNew, &nNew);
|
|
if( pNew ){
|
|
int typemask = (pCsr->flags & CURSOR_IGNORE_DELETE) ? ~(0) : LSM_SYSTEMKEY;
|
|
int res = sortedDbKeyCompare(pCsr,
|
|
eNewType & typemask, pNew, nNew,
|
|
pCsr->eType & typemask, pCsr->key.pData, pCsr->key.nData
|
|
);
|
|
|
|
if( (bReverse==0 && res<=0) || (bReverse!=0 && res>=0) ){
|
|
return 0;
|
|
}
|
|
|
|
multiCursorCacheKey(pCsr, pRc);
|
|
assert( pCsr->eType==eNewType );
|
|
|
|
/* If this cursor is configured to skip deleted keys, and the current
|
|
** cursor points to a SORTED_DELETE entry, then the cursor has not been
|
|
** successfully advanced.
|
|
**
|
|
** Similarly, if the cursor is configured to skip system keys and the
|
|
** current cursor points to a system key, it has not yet been advanced.
|
|
*/
|
|
if( *pRc==LSM_OK && 0==mcursorLocationOk(pCsr, 0) ) return 0;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
static void flCsrAdvance(MultiCursor *pCsr){
|
|
assert( pCsr->flags & CURSOR_FLUSH_FREELIST );
|
|
if( pCsr->iFree % 2 ){
|
|
pCsr->iFree++;
|
|
}else{
|
|
int nEntry = pCsr->pDb->pWorker->freelist.nEntry;
|
|
FreelistEntry *aEntry = pCsr->pDb->pWorker->freelist.aEntry;
|
|
|
|
int i = nEntry - 1 - (pCsr->iFree / 2);
|
|
|
|
/* If the current entry is a delete and the "end-delete" key will not
|
|
** be attached to the next entry, increment iFree by 1 only. */
|
|
if( aEntry[i].iId<0 ){
|
|
while( 1 ){
|
|
if( i==0 || aEntry[i-1].iBlk!=aEntry[i].iBlk-1 ){
|
|
pCsr->iFree--;
|
|
break;
|
|
}
|
|
if( aEntry[i-1].iId>=0 ) break;
|
|
pCsr->iFree += 2;
|
|
i--;
|
|
}
|
|
}
|
|
pCsr->iFree += 2;
|
|
}
|
|
}
|
|
|
|
static int multiCursorAdvance(MultiCursor *pCsr, int bReverse){
|
|
int rc = LSM_OK; /* Return Code */
|
|
if( lsmMCursorValid(pCsr) ){
|
|
do {
|
|
int iKey = pCsr->aTree[1];
|
|
|
|
assertCursorTree(pCsr);
|
|
|
|
/* If this multi-cursor is advancing forwards, and the sub-cursor
|
|
** being advanced is the one that separator keys may be being read
|
|
** from, record the current absolute pointer value. */
|
|
if( pCsr->pPrevMergePtr ){
|
|
if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr) ){
|
|
assert( pCsr->pBtCsr );
|
|
*pCsr->pPrevMergePtr = pCsr->pBtCsr->iPtr;
|
|
}else if( pCsr->pBtCsr==0 && pCsr->nPtr>0
|
|
&& iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr-1)
|
|
){
|
|
SegmentPtr *pPtr = &pCsr->aPtr[iKey-CURSOR_DATA_SEGMENT];
|
|
*pCsr->pPrevMergePtr = pPtr->iPtr+pPtr->iPgPtr;
|
|
}
|
|
}
|
|
|
|
if( iKey==CURSOR_DATA_TREE0 || iKey==CURSOR_DATA_TREE1 ){
|
|
TreeCursor *pTreeCsr = pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0];
|
|
if( bReverse ){
|
|
rc = lsmTreeCursorPrev(pTreeCsr);
|
|
}else{
|
|
rc = lsmTreeCursorNext(pTreeCsr);
|
|
}
|
|
}else if( iKey==CURSOR_DATA_SYSTEM ){
|
|
assert( pCsr->flags & CURSOR_FLUSH_FREELIST );
|
|
assert( bReverse==0 );
|
|
flCsrAdvance(pCsr);
|
|
}else if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr) ){
|
|
assert( bReverse==0 && pCsr->pBtCsr );
|
|
rc = btreeCursorNext(pCsr->pBtCsr);
|
|
}else{
|
|
rc = segmentCursorAdvance(pCsr, iKey-CURSOR_DATA_SEGMENT, bReverse);
|
|
}
|
|
if( rc==LSM_OK ){
|
|
int i;
|
|
for(i=(iKey+pCsr->nTree)/2; i>0; i=i/2){
|
|
multiCursorDoCompare(pCsr, i, bReverse);
|
|
}
|
|
assertCursorTree(pCsr);
|
|
}
|
|
}while( mcursorAdvanceOk(pCsr, bReverse, &rc)==0 );
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
int lsmMCursorNext(MultiCursor *pCsr){
|
|
if( (pCsr->flags & CURSOR_NEXT_OK)==0 ) return LSM_MISUSE_BKPT;
|
|
return multiCursorAdvance(pCsr, 0);
|
|
}
|
|
|
|
int lsmMCursorPrev(MultiCursor *pCsr){
|
|
if( (pCsr->flags & CURSOR_PREV_OK)==0 ) return LSM_MISUSE_BKPT;
|
|
return multiCursorAdvance(pCsr, 1);
|
|
}
|
|
|
|
int lsmMCursorKey(MultiCursor *pCsr, void **ppKey, int *pnKey){
|
|
if( (pCsr->flags & CURSOR_SEEK_EQ) || pCsr->aTree==0 ){
|
|
*pnKey = pCsr->key.nData;
|
|
*ppKey = pCsr->key.pData;
|
|
}else{
|
|
int iKey = pCsr->aTree[1];
|
|
|
|
if( iKey==CURSOR_DATA_TREE0 || iKey==CURSOR_DATA_TREE1 ){
|
|
TreeCursor *pTreeCsr = pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0];
|
|
lsmTreeCursorKey(pTreeCsr, 0, ppKey, pnKey);
|
|
}else{
|
|
int nKey;
|
|
|
|
#ifndef NDEBUG
|
|
void *pKey;
|
|
int eType;
|
|
multiCursorGetKey(pCsr, iKey, &eType, &pKey, &nKey);
|
|
assert( eType==pCsr->eType );
|
|
assert( nKey==pCsr->key.nData );
|
|
assert( memcmp(pKey, pCsr->key.pData, nKey)==0 );
|
|
#endif
|
|
|
|
nKey = pCsr->key.nData;
|
|
if( nKey==0 ){
|
|
*ppKey = 0;
|
|
}else{
|
|
*ppKey = pCsr->key.pData;
|
|
}
|
|
*pnKey = nKey;
|
|
}
|
|
}
|
|
return LSM_OK;
|
|
}
|
|
|
|
/*
|
|
** Compare the current key that cursor csr points to with pKey/nKey. Set
|
|
** *piRes to the result and return LSM_OK.
|
|
*/
|
|
int lsm_csr_cmp(lsm_cursor *csr, const void *pKey, int nKey, int *piRes){
|
|
MultiCursor *pCsr = (MultiCursor *)csr;
|
|
void *pCsrkey; int nCsrkey;
|
|
int rc;
|
|
rc = lsmMCursorKey(pCsr, &pCsrkey, &nCsrkey);
|
|
if( rc==LSM_OK ){
|
|
int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
|
|
*piRes = sortedKeyCompare(xCmp, 0, pCsrkey, nCsrkey, 0, (void *)pKey, nKey);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
int lsmMCursorValue(MultiCursor *pCsr, void **ppVal, int *pnVal){
|
|
void *pVal;
|
|
int nVal;
|
|
int rc;
|
|
if( (pCsr->flags & CURSOR_SEEK_EQ) || pCsr->aTree==0 ){
|
|
rc = LSM_OK;
|
|
nVal = pCsr->val.nData;
|
|
pVal = pCsr->val.pData;
|
|
}else{
|
|
|
|
assert( pCsr->aTree );
|
|
assert( mcursorLocationOk(pCsr, (pCsr->flags & CURSOR_IGNORE_DELETE)) );
|
|
|
|
rc = multiCursorGetVal(pCsr, pCsr->aTree[1], &pVal, &nVal);
|
|
if( pVal && rc==LSM_OK ){
|
|
rc = sortedBlobSet(pCsr->pDb->pEnv, &pCsr->val, pVal, nVal);
|
|
pVal = pCsr->val.pData;
|
|
}
|
|
|
|
if( rc!=LSM_OK ){
|
|
pVal = 0;
|
|
nVal = 0;
|
|
}
|
|
}
|
|
*ppVal = pVal;
|
|
*pnVal = nVal;
|
|
return rc;
|
|
}
|
|
|
|
int lsmMCursorType(MultiCursor *pCsr, int *peType){
|
|
assert( pCsr->aTree );
|
|
multiCursorGetKey(pCsr, pCsr->aTree[1], peType, 0, 0);
|
|
return LSM_OK;
|
|
}
|
|
|
|
/*
|
|
** Buffer aData[], size nData, is assumed to contain a valid b-tree
|
|
** hierarchy page image. Return the offset in aData[] of the next free
|
|
** byte in the data area (where a new cell may be written if there is
|
|
** space).
|
|
*/
|
|
static int mergeWorkerPageOffset(u8 *aData, int nData){
|
|
int nRec;
|
|
int iOff;
|
|
int nKey;
|
|
int eType;
|
|
i64 nDummy;
|
|
|
|
|
|
nRec = lsmGetU16(&aData[SEGMENT_NRECORD_OFFSET(nData)]);
|
|
iOff = lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec-1)]);
|
|
eType = aData[iOff++];
|
|
assert( eType==0
|
|
|| eType==(LSM_SYSTEMKEY|LSM_SEPARATOR)
|
|
|| eType==(LSM_SEPARATOR)
|
|
);
|
|
|
|
iOff += lsmVarintGet64(&aData[iOff], &nDummy);
|
|
iOff += lsmVarintGet32(&aData[iOff], &nKey);
|
|
|
|
return iOff + (eType ? nKey : 0);
|
|
}
|
|
|
|
/*
|
|
** Following a checkpoint operation, database pages that are part of the
|
|
** checkpointed state of the LSM are deemed read-only. This includes the
|
|
** right-most page of the b-tree hierarchy of any separators array under
|
|
** construction, and all pages between it and the b-tree root, inclusive.
|
|
** This is a problem, as when further pages are appended to the separators
|
|
** array, entries must be added to the indicated b-tree hierarchy pages.
|
|
**
|
|
** This function copies all such b-tree pages to new locations, so that
|
|
** they can be modified as required.
|
|
**
|
|
** The complication is that not all database pages are the same size - due
|
|
** to the way the file.c module works some (the first and last in each block)
|
|
** are 4 bytes smaller than the others.
|
|
*/
|
|
static int mergeWorkerMoveHierarchy(
|
|
MergeWorker *pMW, /* Merge worker */
|
|
int bSep /* True for separators run */
|
|
){
|
|
lsm_db *pDb = pMW->pDb; /* Database handle */
|
|
int rc = LSM_OK; /* Return code */
|
|
int i;
|
|
Page **apHier = pMW->hier.apHier;
|
|
int nHier = pMW->hier.nHier;
|
|
|
|
for(i=0; rc==LSM_OK && i<nHier; i++){
|
|
Page *pNew = 0;
|
|
rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pMW->pLevel, 1, &pNew);
|
|
assert( rc==LSM_OK );
|
|
|
|
if( rc==LSM_OK ){
|
|
u8 *a1; int n1;
|
|
u8 *a2; int n2;
|
|
|
|
a1 = fsPageData(pNew, &n1);
|
|
a2 = fsPageData(apHier[i], &n2);
|
|
|
|
assert( n1==n2 || n1+4==n2 );
|
|
|
|
if( n1==n2 ){
|
|
memcpy(a1, a2, n2);
|
|
}else{
|
|
int nEntry = pageGetNRec(a2, n2);
|
|
int iEof1 = SEGMENT_EOF(n1, nEntry);
|
|
int iEof2 = SEGMENT_EOF(n2, nEntry);
|
|
|
|
memcpy(a1, a2, iEof2 - 4);
|
|
memcpy(&a1[iEof1], &a2[iEof2], n2 - iEof2);
|
|
}
|
|
|
|
lsmFsPageRelease(apHier[i]);
|
|
apHier[i] = pNew;
|
|
|
|
#if 0
|
|
assert( n1==n2 || n1+4==n2 || n2+4==n1 );
|
|
if( n1>=n2 ){
|
|
/* If n1 (size of the new page) is equal to or greater than n2 (the
|
|
** size of the old page), then copy the data into the new page. If
|
|
** n1==n2, this could be done with a single memcpy(). However,
|
|
** since sometimes n1>n2, the page content and footer must be copied
|
|
** separately. */
|
|
int nEntry = pageGetNRec(a2, n2);
|
|
int iEof1 = SEGMENT_EOF(n1, nEntry);
|
|
int iEof2 = SEGMENT_EOF(n2, nEntry);
|
|
memcpy(a1, a2, iEof2);
|
|
memcpy(&a1[iEof1], &a2[iEof2], n2 - iEof2);
|
|
lsmFsPageRelease(apHier[i]);
|
|
apHier[i] = pNew;
|
|
}else{
|
|
lsmPutU16(&a1[SEGMENT_FLAGS_OFFSET(n1)], SEGMENT_BTREE_FLAG);
|
|
lsmPutU16(&a1[SEGMENT_NRECORD_OFFSET(n1)], 0);
|
|
lsmPutU64(&a1[SEGMENT_POINTER_OFFSET(n1)], 0);
|
|
i = i - 1;
|
|
lsmFsPageRelease(pNew);
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
|
|
#ifdef LSM_DEBUG
|
|
if( rc==LSM_OK ){
|
|
for(i=0; i<nHier; i++) assert( lsmFsPageWritable(apHier[i]) );
|
|
}
|
|
#endif
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Allocate and populate the MergeWorker.apHier[] array.
|
|
*/
|
|
static int mergeWorkerLoadHierarchy(MergeWorker *pMW){
|
|
int rc = LSM_OK;
|
|
Segment *pSeg;
|
|
Hierarchy *p;
|
|
|
|
pSeg = &pMW->pLevel->lhs;
|
|
p = &pMW->hier;
|
|
|
|
if( p->apHier==0 && pSeg->iRoot!=0 ){
|
|
FileSystem *pFS = pMW->pDb->pFS;
|
|
lsm_env *pEnv = pMW->pDb->pEnv;
|
|
Page **apHier = 0;
|
|
int nHier = 0;
|
|
LsmPgno iPg = pSeg->iRoot;
|
|
|
|
do {
|
|
Page *pPg = 0;
|
|
u8 *aData;
|
|
int nData;
|
|
int flags;
|
|
|
|
rc = lsmFsDbPageGet(pFS, pSeg, iPg, &pPg);
|
|
if( rc!=LSM_OK ) break;
|
|
|
|
aData = fsPageData(pPg, &nData);
|
|
flags = pageGetFlags(aData, nData);
|
|
if( flags&SEGMENT_BTREE_FLAG ){
|
|
Page **apNew = (Page **)lsmRealloc(
|
|
pEnv, apHier, sizeof(Page *)*(nHier+1)
|
|
);
|
|
if( apNew==0 ){
|
|
rc = LSM_NOMEM_BKPT;
|
|
break;
|
|
}
|
|
apHier = apNew;
|
|
memmove(&apHier[1], &apHier[0], sizeof(Page *) * nHier);
|
|
nHier++;
|
|
|
|
apHier[0] = pPg;
|
|
iPg = pageGetPtr(aData, nData);
|
|
}else{
|
|
lsmFsPageRelease(pPg);
|
|
break;
|
|
}
|
|
}while( 1 );
|
|
|
|
if( rc==LSM_OK ){
|
|
u8 *aData;
|
|
int nData;
|
|
aData = fsPageData(apHier[0], &nData);
|
|
pMW->aSave[0].iPgno = pageGetPtr(aData, nData);
|
|
p->nHier = nHier;
|
|
p->apHier = apHier;
|
|
rc = mergeWorkerMoveHierarchy(pMW, 0);
|
|
}else{
|
|
int i;
|
|
for(i=0; i<nHier; i++){
|
|
lsmFsPageRelease(apHier[i]);
|
|
}
|
|
lsmFree(pEnv, apHier);
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** B-tree pages use almost the same format as regular pages. The
|
|
** differences are:
|
|
**
|
|
** 1. The record format is (usually, see below) as follows:
|
|
**
|
|
** + Type byte (always SORTED_SEPARATOR or SORTED_SYSTEM_SEPARATOR),
|
|
** + Absolute pointer value (varint),
|
|
** + Number of bytes in key (varint),
|
|
** + LsmBlob containing key data.
|
|
**
|
|
** 2. All pointer values are stored as absolute values (not offsets
|
|
** relative to the footer pointer value).
|
|
**
|
|
** 3. Each pointer that is part of a record points to a page that
|
|
** contains keys smaller than the records key (note: not "equal to or
|
|
** smaller than - smaller than").
|
|
**
|
|
** 4. The pointer in the page footer of a b-tree page points to a page
|
|
** that contains keys equal to or larger than the largest key on the
|
|
** b-tree page.
|
|
**
|
|
** The reason for having the page footer pointer point to the right-child
|
|
** (instead of the left) is that doing things this way makes the
|
|
** mergeWorkerMoveHierarchy() operation less complicated (since the pointers
|
|
** that need to be updated are all stored as fixed-size integers within the
|
|
** page footer, not varints in page records).
|
|
**
|
|
** Records may not span b-tree pages. If this function is called to add a
|
|
** record larger than (page-size / 4) bytes, then a pointer to the indexed
|
|
** array page that contains the main record is added to the b-tree instead.
|
|
** In this case the record format is:
|
|
**
|
|
** + 0x00 byte (1 byte)
|
|
** + Absolute pointer value (varint),
|
|
** + Absolute page number of page containing key (varint).
|
|
**
|
|
** See function seekInBtree() for the code that traverses b-tree pages.
|
|
*/
|
|
|
|
static int mergeWorkerBtreeWrite(
|
|
MergeWorker *pMW,
|
|
u8 eType,
|
|
LsmPgno iPtr,
|
|
LsmPgno iKeyPg,
|
|
void *pKey,
|
|
int nKey
|
|
){
|
|
Hierarchy *p = &pMW->hier;
|
|
lsm_db *pDb = pMW->pDb; /* Database handle */
|
|
int rc = LSM_OK; /* Return Code */
|
|
int iLevel; /* Level of b-tree hierachy to write to */
|
|
int nData; /* Size of aData[] in bytes */
|
|
u8 *aData; /* Page data for level iLevel */
|
|
int iOff; /* Offset on b-tree page to write record to */
|
|
int nRec; /* Initial number of records on b-tree page */
|
|
|
|
/* iKeyPg should be zero for an ordinary b-tree key, or non-zero for an
|
|
** indirect key. The flags byte for an indirect key is 0x00. */
|
|
assert( (eType==0)==(iKeyPg!=0) );
|
|
|
|
/* The MergeWorker.apHier[] array contains the right-most leaf of the b-tree
|
|
** hierarchy, the root node, and all nodes that lie on the path between.
|
|
** apHier[0] is the right-most leaf and apHier[pMW->nHier-1] is the current
|
|
** root page.
|
|
**
|
|
** This loop searches for a node with enough space to store the key on,
|
|
** starting with the leaf and iterating up towards the root. When the loop
|
|
** exits, the key may be written to apHier[iLevel]. */
|
|
for(iLevel=0; iLevel<=p->nHier; iLevel++){
|
|
int nByte; /* Number of free bytes required */
|
|
|
|
if( iLevel==p->nHier ){
|
|
/* Extend the array and allocate a new root page. */
|
|
Page **aNew;
|
|
aNew = (Page **)lsmRealloc(
|
|
pMW->pDb->pEnv, p->apHier, sizeof(Page *)*(p->nHier+1)
|
|
);
|
|
if( !aNew ){
|
|
return LSM_NOMEM_BKPT;
|
|
}
|
|
p->apHier = aNew;
|
|
}else{
|
|
Page *pOld;
|
|
int nFree;
|
|
|
|
/* If the key will fit on this page, break out of the loop here.
|
|
** The new entry will be written to page apHier[iLevel]. */
|
|
pOld = p->apHier[iLevel];
|
|
assert( lsmFsPageWritable(pOld) );
|
|
aData = fsPageData(pOld, &nData);
|
|
if( eType==0 ){
|
|
nByte = 2 + 1 + lsmVarintLen64(iPtr) + lsmVarintLen64(iKeyPg);
|
|
}else{
|
|
nByte = 2 + 1 + lsmVarintLen64(iPtr) + lsmVarintLen32(nKey) + nKey;
|
|
}
|
|
|
|
nRec = pageGetNRec(aData, nData);
|
|
nFree = SEGMENT_EOF(nData, nRec) - mergeWorkerPageOffset(aData, nData);
|
|
if( nByte<=nFree ) break;
|
|
|
|
/* Otherwise, this page is full. Set the right-hand-child pointer
|
|
** to iPtr and release it. */
|
|
lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iPtr);
|
|
assert( lsmFsPageNumber(pOld)==0 );
|
|
rc = lsmFsPagePersist(pOld);
|
|
if( rc==LSM_OK ){
|
|
iPtr = lsmFsPageNumber(pOld);
|
|
lsmFsPageRelease(pOld);
|
|
}
|
|
}
|
|
|
|
/* Allocate a new page for apHier[iLevel]. */
|
|
p->apHier[iLevel] = 0;
|
|
if( rc==LSM_OK ){
|
|
rc = lsmFsSortedAppend(
|
|
pDb->pFS, pDb->pWorker, pMW->pLevel, 1, &p->apHier[iLevel]
|
|
);
|
|
}
|
|
if( rc!=LSM_OK ) return rc;
|
|
|
|
aData = fsPageData(p->apHier[iLevel], &nData);
|
|
memset(aData, 0, nData);
|
|
lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], SEGMENT_BTREE_FLAG);
|
|
lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], 0);
|
|
|
|
if( iLevel==p->nHier ){
|
|
p->nHier++;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Write the key into page apHier[iLevel]. */
|
|
aData = fsPageData(p->apHier[iLevel], &nData);
|
|
iOff = mergeWorkerPageOffset(aData, nData);
|
|
nRec = pageGetNRec(aData, nData);
|
|
lsmPutU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec)], (u16)iOff);
|
|
lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], (u16)(nRec+1));
|
|
if( eType==0 ){
|
|
aData[iOff++] = 0x00;
|
|
iOff += lsmVarintPut64(&aData[iOff], iPtr);
|
|
iOff += lsmVarintPut64(&aData[iOff], iKeyPg);
|
|
}else{
|
|
aData[iOff++] = eType;
|
|
iOff += lsmVarintPut64(&aData[iOff], iPtr);
|
|
iOff += lsmVarintPut32(&aData[iOff], nKey);
|
|
memcpy(&aData[iOff], pKey, nKey);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int mergeWorkerBtreeIndirect(MergeWorker *pMW){
|
|
int rc = LSM_OK;
|
|
if( pMW->iIndirect ){
|
|
LsmPgno iKeyPg = pMW->aSave[1].iPgno;
|
|
rc = mergeWorkerBtreeWrite(pMW, 0, pMW->iIndirect, iKeyPg, 0, 0);
|
|
pMW->iIndirect = 0;
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Append the database key (iTopic/pKey/nKey) to the b-tree under
|
|
** construction. This key has not yet been written to a segment page.
|
|
** The pointer that will accompany the new key in the b-tree - that
|
|
** points to the completed segment page that contains keys smaller than
|
|
** (pKey/nKey) is currently stored in pMW->aSave[0].iPgno.
|
|
*/
|
|
static int mergeWorkerPushHierarchy(
|
|
MergeWorker *pMW, /* Merge worker object */
|
|
int iTopic, /* Topic value for this key */
|
|
void *pKey, /* Pointer to key buffer */
|
|
int nKey /* Size of pKey buffer in bytes */
|
|
){
|
|
int rc = LSM_OK; /* Return Code */
|
|
LsmPgno iPtr; /* Pointer value to accompany pKey/nKey */
|
|
|
|
assert( pMW->aSave[0].bStore==0 );
|
|
assert( pMW->aSave[1].bStore==0 );
|
|
rc = mergeWorkerBtreeIndirect(pMW);
|
|
|
|
/* Obtain the absolute pointer value to store along with the key in the
|
|
** page body. This pointer points to a page that contains keys that are
|
|
** smaller than pKey/nKey. */
|
|
iPtr = pMW->aSave[0].iPgno;
|
|
assert( iPtr!=0 );
|
|
|
|
/* Determine if the indirect format should be used. */
|
|
if( (nKey*4 > lsmFsPageSize(pMW->pDb->pFS)) ){
|
|
pMW->iIndirect = iPtr;
|
|
pMW->aSave[1].bStore = 1;
|
|
}else{
|
|
rc = mergeWorkerBtreeWrite(
|
|
pMW, (u8)(iTopic | LSM_SEPARATOR), iPtr, 0, pKey, nKey
|
|
);
|
|
}
|
|
|
|
/* Ensure that the SortedRun.iRoot field is correct. */
|
|
return rc;
|
|
}
|
|
|
|
static int mergeWorkerFinishHierarchy(
|
|
MergeWorker *pMW /* Merge worker object */
|
|
){
|
|
int i; /* Used to loop through apHier[] */
|
|
int rc = LSM_OK; /* Return code */
|
|
LsmPgno iPtr; /* New right-hand-child pointer value */
|
|
|
|
iPtr = pMW->aSave[0].iPgno;
|
|
for(i=0; i<pMW->hier.nHier && rc==LSM_OK; i++){
|
|
Page *pPg = pMW->hier.apHier[i];
|
|
int nData; /* Size of aData[] in bytes */
|
|
u8 *aData; /* Page data for pPg */
|
|
|
|
aData = fsPageData(pPg, &nData);
|
|
lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iPtr);
|
|
|
|
rc = lsmFsPagePersist(pPg);
|
|
iPtr = lsmFsPageNumber(pPg);
|
|
lsmFsPageRelease(pPg);
|
|
}
|
|
|
|
if( pMW->hier.nHier ){
|
|
pMW->pLevel->lhs.iRoot = iPtr;
|
|
lsmFree(pMW->pDb->pEnv, pMW->hier.apHier);
|
|
pMW->hier.apHier = 0;
|
|
pMW->hier.nHier = 0;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int mergeWorkerAddPadding(
|
|
MergeWorker *pMW /* Merge worker object */
|
|
){
|
|
FileSystem *pFS = pMW->pDb->pFS;
|
|
return lsmFsSortedPadding(pFS, pMW->pDb->pWorker, &pMW->pLevel->lhs);
|
|
}
|
|
|
|
/*
|
|
** Release all page references currently held by the merge-worker passed
|
|
** as the only argument. Unless an error has occurred, all pages have
|
|
** already been released.
|
|
*/
|
|
static void mergeWorkerReleaseAll(MergeWorker *pMW){
|
|
int i;
|
|
lsmFsPageRelease(pMW->pPage);
|
|
pMW->pPage = 0;
|
|
|
|
for(i=0; i<pMW->hier.nHier; i++){
|
|
lsmFsPageRelease(pMW->hier.apHier[i]);
|
|
pMW->hier.apHier[i] = 0;
|
|
}
|
|
lsmFree(pMW->pDb->pEnv, pMW->hier.apHier);
|
|
pMW->hier.apHier = 0;
|
|
pMW->hier.nHier = 0;
|
|
}
|
|
|
|
static int keyszToSkip(FileSystem *pFS, int nKey){
|
|
int nPgsz; /* Nominal database page size */
|
|
nPgsz = lsmFsPageSize(pFS);
|
|
return LSM_MIN(((nKey * 4) / nPgsz), 3);
|
|
}
|
|
|
|
/*
|
|
** Release the reference to the current output page of merge-worker *pMW
|
|
** (reference pMW->pPage). Set the page number values in aSave[] as
|
|
** required (see comments above struct MergeWorker for details).
|
|
*/
|
|
static int mergeWorkerPersistAndRelease(MergeWorker *pMW){
|
|
int rc;
|
|
int i;
|
|
|
|
assert( pMW->pPage || (pMW->aSave[0].bStore==0 && pMW->aSave[1].bStore==0) );
|
|
|
|
/* Persist the page */
|
|
rc = lsmFsPagePersist(pMW->pPage);
|
|
|
|
/* If required, save the page number. */
|
|
for(i=0; i<2; i++){
|
|
if( pMW->aSave[i].bStore ){
|
|
pMW->aSave[i].iPgno = lsmFsPageNumber(pMW->pPage);
|
|
pMW->aSave[i].bStore = 0;
|
|
}
|
|
}
|
|
|
|
/* Release the completed output page. */
|
|
lsmFsPageRelease(pMW->pPage);
|
|
pMW->pPage = 0;
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Advance to the next page of an output run being populated by merge-worker
|
|
** pMW. The footer of the new page is initialized to indicate that it contains
|
|
** zero records. The flags field is cleared. The page footer pointer field
|
|
** is set to iFPtr.
|
|
**
|
|
** If successful, LSM_OK is returned. Otherwise, an error code.
|
|
*/
|
|
static int mergeWorkerNextPage(
|
|
MergeWorker *pMW, /* Merge worker object to append page to */
|
|
LsmPgno iFPtr /* Pointer value for footer of new page */
|
|
){
|
|
int rc = LSM_OK; /* Return code */
|
|
Page *pNext = 0; /* New page appended to run */
|
|
lsm_db *pDb = pMW->pDb; /* Database handle */
|
|
|
|
rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pMW->pLevel, 0, &pNext);
|
|
assert( rc || pMW->pLevel->lhs.iFirst>0 || pMW->pDb->compress.xCompress );
|
|
|
|
if( rc==LSM_OK ){
|
|
u8 *aData; /* Data buffer belonging to page pNext */
|
|
int nData; /* Size of aData[] in bytes */
|
|
|
|
rc = mergeWorkerPersistAndRelease(pMW);
|
|
|
|
pMW->pPage = pNext;
|
|
pMW->pLevel->pMerge->iOutputOff = 0;
|
|
aData = fsPageData(pNext, &nData);
|
|
lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], 0);
|
|
lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], 0);
|
|
lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iFPtr);
|
|
pMW->nWork++;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Write a blob of data into an output segment being populated by a
|
|
** merge-worker object. If argument bSep is true, write into the separators
|
|
** array. Otherwise, the main array.
|
|
**
|
|
** This function is used to write the blobs of data for keys and values.
|
|
*/
|
|
static int mergeWorkerData(
|
|
MergeWorker *pMW, /* Merge worker object */
|
|
int bSep, /* True to write to separators run */
|
|
LsmPgno iFPtr, /* Footer ptr for new pages */
|
|
u8 *aWrite, /* Write data from this buffer */
|
|
int nWrite /* Size of aWrite[] in bytes */
|
|
){
|
|
int rc = LSM_OK; /* Return code */
|
|
int nRem = nWrite; /* Number of bytes still to write */
|
|
|
|
while( rc==LSM_OK && nRem>0 ){
|
|
Merge *pMerge = pMW->pLevel->pMerge;
|
|
int nCopy; /* Number of bytes to copy */
|
|
u8 *aData; /* Pointer to buffer of current output page */
|
|
int nData; /* Size of aData[] in bytes */
|
|
int nRec; /* Number of records on current output page */
|
|
int iOff; /* Offset in aData[] to write to */
|
|
|
|
assert( lsmFsPageWritable(pMW->pPage) );
|
|
|
|
aData = fsPageData(pMW->pPage, &nData);
|
|
nRec = pageGetNRec(aData, nData);
|
|
iOff = pMerge->iOutputOff;
|
|
nCopy = LSM_MIN(nRem, SEGMENT_EOF(nData, nRec) - iOff);
|
|
|
|
memcpy(&aData[iOff], &aWrite[nWrite-nRem], nCopy);
|
|
nRem -= nCopy;
|
|
|
|
if( nRem>0 ){
|
|
rc = mergeWorkerNextPage(pMW, iFPtr);
|
|
}else{
|
|
pMerge->iOutputOff = iOff + nCopy;
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
|
|
/*
|
|
** The MergeWorker passed as the only argument is working to merge two or
|
|
** more existing segments together (not to flush an in-memory tree). It
|
|
** has not yet written the first key to the first page of the output.
|
|
*/
|
|
static int mergeWorkerFirstPage(MergeWorker *pMW){
|
|
int rc = LSM_OK; /* Return code */
|
|
Page *pPg = 0; /* First page of run pSeg */
|
|
LsmPgno iFPtr = 0; /* Pointer value read from footer of pPg */
|
|
MultiCursor *pCsr = pMW->pCsr;
|
|
|
|
assert( pMW->pPage==0 );
|
|
|
|
if( pCsr->pBtCsr ){
|
|
rc = LSM_OK;
|
|
iFPtr = pMW->pLevel->pNext->lhs.iFirst;
|
|
}else if( pCsr->nPtr>0 ){
|
|
Segment *pSeg;
|
|
pSeg = pCsr->aPtr[pCsr->nPtr-1].pSeg;
|
|
rc = lsmFsDbPageGet(pMW->pDb->pFS, pSeg, pSeg->iFirst, &pPg);
|
|
if( rc==LSM_OK ){
|
|
u8 *aData; /* Buffer for page pPg */
|
|
int nData; /* Size of aData[] in bytes */
|
|
aData = fsPageData(pPg, &nData);
|
|
iFPtr = pageGetPtr(aData, nData);
|
|
lsmFsPageRelease(pPg);
|
|
}
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
rc = mergeWorkerNextPage(pMW, iFPtr);
|
|
if( pCsr->pPrevMergePtr ) *pCsr->pPrevMergePtr = iFPtr;
|
|
pMW->aSave[0].bStore = 1;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int mergeWorkerWrite(
|
|
MergeWorker *pMW, /* Merge worker object to write into */
|
|
int eType, /* One of SORTED_SEPARATOR, WRITE or DELETE */
|
|
void *pKey, int nKey, /* Key value */
|
|
void *pVal, int nVal, /* Value value */
|
|
LsmPgno iPtr /* Absolute value of page pointer, or 0 */
|
|
){
|
|
int rc = LSM_OK; /* Return code */
|
|
Merge *pMerge; /* Persistent part of level merge state */
|
|
int nHdr; /* Space required for this record header */
|
|
Page *pPg; /* Page to write to */
|
|
u8 *aData; /* Data buffer for page pWriter->pPage */
|
|
int nData = 0; /* Size of buffer aData[] in bytes */
|
|
int nRec = 0; /* Number of records on page pPg */
|
|
LsmPgno iFPtr = 0; /* Value of pointer in footer of pPg */
|
|
LsmPgno iRPtr = 0; /* Value of pointer written into record */
|
|
int iOff = 0; /* Current write offset within page pPg */
|
|
Segment *pSeg; /* Segment being written */
|
|
int flags = 0; /* If != 0, flags value for page footer */
|
|
int bFirst = 0; /* True for first key of output run */
|
|
|
|
pMerge = pMW->pLevel->pMerge;
|
|
pSeg = &pMW->pLevel->lhs;
|
|
|
|
if( pSeg->iFirst==0 && pMW->pPage==0 ){
|
|
rc = mergeWorkerFirstPage(pMW);
|
|
bFirst = 1;
|
|
}
|
|
pPg = pMW->pPage;
|
|
if( pPg ){
|
|
aData = fsPageData(pPg, &nData);
|
|
nRec = pageGetNRec(aData, nData);
|
|
iFPtr = pageGetPtr(aData, nData);
|
|
iRPtr = iPtr ? (iPtr - iFPtr) : 0;
|
|
}
|
|
|
|
/* Figure out how much space is required by the new record. The space
|
|
** required is divided into two sections: the header and the body. The
|
|
** header consists of the intial varint fields. The body are the blobs
|
|
** of data that correspond to the key and value data. The entire header
|
|
** must be stored on the page. The body may overflow onto the next and
|
|
** subsequent pages.
|
|
**
|
|
** The header space is:
|
|
**
|
|
** 1) record type - 1 byte.
|
|
** 2) Page-pointer-offset - 1 varint
|
|
** 3) Key size - 1 varint
|
|
** 4) Value size - 1 varint (only if LSM_INSERT flag is set)
|
|
*/
|
|
if( rc==LSM_OK ){
|
|
nHdr = 1 + lsmVarintLen64(iRPtr) + lsmVarintLen32(nKey);
|
|
if( rtIsWrite(eType) ) nHdr += lsmVarintLen32(nVal);
|
|
|
|
/* If the entire header will not fit on page pPg, or if page pPg is
|
|
** marked read-only, advance to the next page of the output run. */
|
|
iOff = pMerge->iOutputOff;
|
|
if( iOff<0 || pPg==0 || iOff+nHdr > SEGMENT_EOF(nData, nRec+1) ){
|
|
if( iOff>=0 && pPg ){
|
|
/* Zero any free space on the page */
|
|
assert( aData );
|
|
memset(&aData[iOff], 0, SEGMENT_EOF(nData, nRec)-iOff);
|
|
}
|
|
iFPtr = *pMW->pCsr->pPrevMergePtr;
|
|
iRPtr = iPtr ? (iPtr - iFPtr) : 0;
|
|
iOff = 0;
|
|
nRec = 0;
|
|
rc = mergeWorkerNextPage(pMW, iFPtr);
|
|
pPg = pMW->pPage;
|
|
}
|
|
}
|
|
|
|
/* If this record header will be the first on the page, and the page is
|
|
** not the very first in the entire run, add a copy of the key to the
|
|
** b-tree hierarchy.
|
|
*/
|
|
if( rc==LSM_OK && nRec==0 && bFirst==0 ){
|
|
assert( pMerge->nSkip>=0 );
|
|
|
|
if( pMerge->nSkip==0 ){
|
|
rc = mergeWorkerPushHierarchy(pMW, rtTopic(eType), pKey, nKey);
|
|
assert( pMW->aSave[0].bStore==0 );
|
|
pMW->aSave[0].bStore = 1;
|
|
pMerge->nSkip = keyszToSkip(pMW->pDb->pFS, nKey);
|
|
}else{
|
|
pMerge->nSkip--;
|
|
flags = PGFTR_SKIP_THIS_FLAG;
|
|
}
|
|
|
|
if( pMerge->nSkip ) flags |= PGFTR_SKIP_NEXT_FLAG;
|
|
}
|
|
|
|
/* Update the output segment */
|
|
if( rc==LSM_OK ){
|
|
aData = fsPageData(pPg, &nData);
|
|
|
|
/* Update the page footer. */
|
|
lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], (u16)(nRec+1));
|
|
lsmPutU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec)], (u16)iOff);
|
|
if( flags ) lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], (u16)flags);
|
|
|
|
/* Write the entry header into the current page. */
|
|
aData[iOff++] = (u8)eType; /* 1 */
|
|
iOff += lsmVarintPut64(&aData[iOff], iRPtr); /* 2 */
|
|
iOff += lsmVarintPut32(&aData[iOff], nKey); /* 3 */
|
|
if( rtIsWrite(eType) ) iOff += lsmVarintPut32(&aData[iOff], nVal); /* 4 */
|
|
pMerge->iOutputOff = iOff;
|
|
|
|
/* Write the key and data into the segment. */
|
|
assert( iFPtr==pageGetPtr(aData, nData) );
|
|
rc = mergeWorkerData(pMW, 0, iFPtr+iRPtr, pKey, nKey);
|
|
if( rc==LSM_OK && rtIsWrite(eType) ){
|
|
if( rc==LSM_OK ){
|
|
rc = mergeWorkerData(pMW, 0, iFPtr+iRPtr, pVal, nVal);
|
|
}
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
|
|
/*
|
|
** Free all resources allocated by mergeWorkerInit().
|
|
*/
|
|
static void mergeWorkerShutdown(MergeWorker *pMW, int *pRc){
|
|
int i; /* Iterator variable */
|
|
int rc = *pRc;
|
|
MultiCursor *pCsr = pMW->pCsr;
|
|
|
|
/* Unless the merge has finished, save the cursor position in the
|
|
** Merge.aInput[] array. See function mergeWorkerInit() for the
|
|
** code to restore a cursor position based on aInput[]. */
|
|
if( rc==LSM_OK && pCsr ){
|
|
Merge *pMerge = pMW->pLevel->pMerge;
|
|
if( lsmMCursorValid(pCsr) ){
|
|
int bBtree = (pCsr->pBtCsr!=0);
|
|
int iPtr;
|
|
|
|
/* pMerge->nInput==0 indicates that this is a FlushTree() operation. */
|
|
assert( pMerge->nInput==0 || pMW->pLevel->nRight>0 );
|
|
assert( pMerge->nInput==0 || pMerge->nInput==(pCsr->nPtr+bBtree) );
|
|
|
|
for(i=0; i<(pMerge->nInput-bBtree); i++){
|
|
SegmentPtr *pPtr = &pCsr->aPtr[i];
|
|
if( pPtr->pPg ){
|
|
pMerge->aInput[i].iPg = lsmFsPageNumber(pPtr->pPg);
|
|
pMerge->aInput[i].iCell = pPtr->iCell;
|
|
}else{
|
|
pMerge->aInput[i].iPg = 0;
|
|
pMerge->aInput[i].iCell = 0;
|
|
}
|
|
}
|
|
if( bBtree && pMerge->nInput ){
|
|
assert( i==pCsr->nPtr );
|
|
btreeCursorPosition(pCsr->pBtCsr, &pMerge->aInput[i]);
|
|
}
|
|
|
|
/* Store the location of the split-key */
|
|
iPtr = pCsr->aTree[1] - CURSOR_DATA_SEGMENT;
|
|
if( iPtr<pCsr->nPtr ){
|
|
pMerge->splitkey = pMerge->aInput[iPtr];
|
|
}else{
|
|
btreeCursorSplitkey(pCsr->pBtCsr, &pMerge->splitkey);
|
|
}
|
|
}
|
|
|
|
/* Zero any free space left on the final page. This helps with
|
|
** compression if using a compression hook. And prevents valgrind
|
|
** from complaining about uninitialized byte passed to write(). */
|
|
if( pMW->pPage ){
|
|
int nData;
|
|
u8 *aData = fsPageData(pMW->pPage, &nData);
|
|
int iOff = pMerge->iOutputOff;
|
|
int iEof = SEGMENT_EOF(nData, pageGetNRec(aData, nData));
|
|
memset(&aData[iOff], 0, iEof - iOff);
|
|
}
|
|
|
|
pMerge->iOutputOff = -1;
|
|
}
|
|
|
|
lsmMCursorClose(pCsr, 0);
|
|
|
|
/* Persist and release the output page. */
|
|
if( rc==LSM_OK ) rc = mergeWorkerPersistAndRelease(pMW);
|
|
if( rc==LSM_OK ) rc = mergeWorkerBtreeIndirect(pMW);
|
|
if( rc==LSM_OK ) rc = mergeWorkerFinishHierarchy(pMW);
|
|
if( rc==LSM_OK ) rc = mergeWorkerAddPadding(pMW);
|
|
lsmFsFlushWaiting(pMW->pDb->pFS, &rc);
|
|
mergeWorkerReleaseAll(pMW);
|
|
|
|
lsmFree(pMW->pDb->pEnv, pMW->aGobble);
|
|
pMW->aGobble = 0;
|
|
pMW->pCsr = 0;
|
|
|
|
*pRc = rc;
|
|
}
|
|
|
|
/*
|
|
** The cursor passed as the first argument is being used as the input for
|
|
** a merge operation. When this function is called, *piFlags contains the
|
|
** database entry flags for the current entry. The entry about to be written
|
|
** to the output.
|
|
**
|
|
** Note that this function only has to work for cursors configured to
|
|
** iterate forwards (not backwards).
|
|
*/
|
|
static void mergeRangeDeletes(MultiCursor *pCsr, int *piVal, int *piFlags){
|
|
int f = *piFlags;
|
|
int iKey = pCsr->aTree[1];
|
|
int i;
|
|
|
|
assert( pCsr->flags & CURSOR_NEXT_OK );
|
|
if( pCsr->flags & CURSOR_IGNORE_DELETE ){
|
|
/* The ignore-delete flag is set when the output of the merge will form
|
|
** the oldest level in the database. In this case there is no point in
|
|
** retaining any range-delete flags. */
|
|
assert( (f & LSM_POINT_DELETE)==0 );
|
|
f &= ~(LSM_START_DELETE|LSM_END_DELETE);
|
|
}else{
|
|
for(i=0; i<(CURSOR_DATA_SEGMENT + pCsr->nPtr); i++){
|
|
if( i!=iKey ){
|
|
int eType;
|
|
void *pKey;
|
|
int nKey;
|
|
int res;
|
|
multiCursorGetKey(pCsr, i, &eType, &pKey, &nKey);
|
|
|
|
if( pKey ){
|
|
res = sortedKeyCompare(pCsr->pDb->xCmp,
|
|
rtTopic(pCsr->eType), pCsr->key.pData, pCsr->key.nData,
|
|
rtTopic(eType), pKey, nKey
|
|
);
|
|
assert( res<=0 );
|
|
if( res==0 ){
|
|
if( (f & (LSM_INSERT|LSM_POINT_DELETE))==0 ){
|
|
if( eType & LSM_INSERT ){
|
|
f |= LSM_INSERT;
|
|
*piVal = i;
|
|
}
|
|
else if( eType & LSM_POINT_DELETE ){
|
|
f |= LSM_POINT_DELETE;
|
|
}
|
|
}
|
|
f |= (eType & (LSM_END_DELETE|LSM_START_DELETE));
|
|
}
|
|
|
|
if( i>iKey && (eType & LSM_END_DELETE) && res<0 ){
|
|
if( f & (LSM_INSERT|LSM_POINT_DELETE) ){
|
|
f |= (LSM_END_DELETE|LSM_START_DELETE);
|
|
}else{
|
|
f = 0;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
assert( (f & LSM_INSERT)==0 || (f & LSM_POINT_DELETE)==0 );
|
|
if( (f & LSM_START_DELETE)
|
|
&& (f & LSM_END_DELETE)
|
|
&& (f & LSM_POINT_DELETE )
|
|
){
|
|
f = 0;
|
|
}
|
|
}
|
|
|
|
*piFlags = f;
|
|
}
|
|
|
|
static int mergeWorkerStep(MergeWorker *pMW){
|
|
lsm_db *pDb = pMW->pDb; /* Database handle */
|
|
MultiCursor *pCsr; /* Cursor to read input data from */
|
|
int rc = LSM_OK; /* Return code */
|
|
int eType; /* SORTED_SEPARATOR, WRITE or DELETE */
|
|
void *pKey; int nKey; /* Key */
|
|
LsmPgno iPtr;
|
|
int iVal;
|
|
|
|
pCsr = pMW->pCsr;
|
|
|
|
/* Pull the next record out of the source cursor. */
|
|
lsmMCursorKey(pCsr, &pKey, &nKey);
|
|
eType = pCsr->eType;
|
|
|
|
/* Figure out if the output record may have a different pointer value
|
|
** than the previous. This is the case if the current key is identical to
|
|
** a key that appears in the lowest level run being merged. If so, set
|
|
** iPtr to the absolute pointer value. If not, leave iPtr set to zero,
|
|
** indicating that the output pointer value should be a copy of the pointer
|
|
** value written with the previous key. */
|
|
iPtr = (pCsr->pPrevMergePtr ? *pCsr->pPrevMergePtr : 0);
|
|
if( pCsr->pBtCsr ){
|
|
BtreeCursor *pBtCsr = pCsr->pBtCsr;
|
|
if( pBtCsr->pKey ){
|
|
int res = rtTopic(pBtCsr->eType) - rtTopic(eType);
|
|
if( res==0 ) res = pDb->xCmp(pBtCsr->pKey, pBtCsr->nKey, pKey, nKey);
|
|
if( 0==res ) iPtr = pBtCsr->iPtr;
|
|
assert( res>=0 );
|
|
}
|
|
}else if( pCsr->nPtr ){
|
|
SegmentPtr *pPtr = &pCsr->aPtr[pCsr->nPtr-1];
|
|
if( pPtr->pPg
|
|
&& 0==pDb->xCmp(pPtr->pKey, pPtr->nKey, pKey, nKey)
|
|
){
|
|
iPtr = pPtr->iPtr+pPtr->iPgPtr;
|
|
}
|
|
}
|
|
|
|
iVal = pCsr->aTree[1];
|
|
mergeRangeDeletes(pCsr, &iVal, &eType);
|
|
|
|
if( eType!=0 ){
|
|
if( pMW->aGobble ){
|
|
int iGobble = pCsr->aTree[1] - CURSOR_DATA_SEGMENT;
|
|
if( iGobble<pCsr->nPtr && iGobble>=0 ){
|
|
SegmentPtr *pGobble = &pCsr->aPtr[iGobble];
|
|
if( (pGobble->flags & PGFTR_SKIP_THIS_FLAG)==0 ){
|
|
pMW->aGobble[iGobble] = lsmFsPageNumber(pGobble->pPg);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* If this is a separator key and we know that the output pointer has not
|
|
** changed, there is no point in writing an output record. Otherwise,
|
|
** proceed. */
|
|
if( rc==LSM_OK && (rtIsSeparator(eType)==0 || iPtr!=0) ){
|
|
/* Write the record into the main run. */
|
|
void *pVal; int nVal;
|
|
rc = multiCursorGetVal(pCsr, iVal, &pVal, &nVal);
|
|
if( pVal && rc==LSM_OK ){
|
|
assert( nVal>=0 );
|
|
rc = sortedBlobSet(pDb->pEnv, &pCsr->val, pVal, nVal);
|
|
pVal = pCsr->val.pData;
|
|
}
|
|
if( rc==LSM_OK ){
|
|
rc = mergeWorkerWrite(pMW, eType, pKey, nKey, pVal, nVal, iPtr);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Advance the cursor to the next input record (assuming one exists). */
|
|
assert( lsmMCursorValid(pMW->pCsr) );
|
|
if( rc==LSM_OK ) rc = lsmMCursorNext(pMW->pCsr);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int mergeWorkerDone(MergeWorker *pMW){
|
|
return pMW->pCsr==0 || !lsmMCursorValid(pMW->pCsr);
|
|
}
|
|
|
|
static void sortedFreeLevel(lsm_env *pEnv, Level *p){
|
|
if( p ){
|
|
lsmFree(pEnv, p->pSplitKey);
|
|
lsmFree(pEnv, p->pMerge);
|
|
lsmFree(pEnv, p->aRhs);
|
|
lsmFree(pEnv, p);
|
|
}
|
|
}
|
|
|
|
static void sortedInvokeWorkHook(lsm_db *pDb){
|
|
if( pDb->xWork ){
|
|
pDb->xWork(pDb, pDb->pWorkCtx);
|
|
}
|
|
}
|
|
|
|
static int sortedNewToplevel(
|
|
lsm_db *pDb, /* Connection handle */
|
|
int eTree, /* One of the TREE_XXX constants */
|
|
int *pnWrite /* OUT: Number of database pages written */
|
|
){
|
|
int rc = LSM_OK; /* Return Code */
|
|
MultiCursor *pCsr = 0;
|
|
Level *pNext = 0; /* The current top level */
|
|
Level *pNew; /* The new level itself */
|
|
Segment *pLinked = 0; /* Delete separators from this segment */
|
|
Level *pDel = 0; /* Delete this entire level */
|
|
int nWrite = 0; /* Number of database pages written */
|
|
Freelist freelist;
|
|
|
|
if( eTree!=TREE_NONE ){
|
|
rc = lsmShmCacheChunks(pDb, pDb->treehdr.nChunk);
|
|
}
|
|
|
|
assert( pDb->bUseFreelist==0 );
|
|
pDb->pFreelist = &freelist;
|
|
pDb->bUseFreelist = 1;
|
|
memset(&freelist, 0, sizeof(freelist));
|
|
|
|
/* Allocate the new level structure to write to. */
|
|
pNext = lsmDbSnapshotLevel(pDb->pWorker);
|
|
pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
|
|
if( pNew ){
|
|
pNew->pNext = pNext;
|
|
lsmDbSnapshotSetLevel(pDb->pWorker, pNew);
|
|
}
|
|
|
|
/* Create a cursor to gather the data required by the new segment. The new
|
|
** segment contains everything in the tree and pointers to the next segment
|
|
** in the database (if any). */
|
|
pCsr = multiCursorNew(pDb, &rc);
|
|
if( pCsr ){
|
|
pCsr->pDb = pDb;
|
|
rc = multiCursorVisitFreelist(pCsr);
|
|
if( rc==LSM_OK ){
|
|
rc = multiCursorAddTree(pCsr, pDb->pWorker, eTree);
|
|
}
|
|
if( rc==LSM_OK && pNext && pNext->pMerge==0 ){
|
|
if( (pNext->flags & LEVEL_FREELIST_ONLY) ){
|
|
pDel = pNext;
|
|
pCsr->aPtr = lsmMallocZeroRc(pDb->pEnv, sizeof(SegmentPtr), &rc);
|
|
multiCursorAddOne(pCsr, pNext, &rc);
|
|
}else if( eTree!=TREE_NONE && pNext->lhs.iRoot ){
|
|
pLinked = &pNext->lhs;
|
|
rc = btreeCursorNew(pDb, pLinked, &pCsr->pBtCsr);
|
|
}
|
|
}
|
|
|
|
/* If this will be the only segment in the database, discard any delete
|
|
** markers present in the in-memory tree. */
|
|
if( pNext==0 ){
|
|
multiCursorIgnoreDelete(pCsr);
|
|
}
|
|
}
|
|
|
|
if( rc!=LSM_OK ){
|
|
lsmMCursorClose(pCsr, 0);
|
|
}else{
|
|
LsmPgno iLeftPtr = 0;
|
|
Merge merge; /* Merge object used to create new level */
|
|
MergeWorker mergeworker; /* MergeWorker object for the same purpose */
|
|
|
|
memset(&merge, 0, sizeof(Merge));
|
|
memset(&mergeworker, 0, sizeof(MergeWorker));
|
|
|
|
pNew->pMerge = &merge;
|
|
pNew->flags |= LEVEL_INCOMPLETE;
|
|
mergeworker.pDb = pDb;
|
|
mergeworker.pLevel = pNew;
|
|
mergeworker.pCsr = pCsr;
|
|
pCsr->pPrevMergePtr = &iLeftPtr;
|
|
|
|
/* Mark the separators array for the new level as a "phantom". */
|
|
mergeworker.bFlush = 1;
|
|
|
|
/* Do the work to create the new merged segment on disk */
|
|
if( rc==LSM_OK ) rc = lsmMCursorFirst(pCsr);
|
|
while( rc==LSM_OK && mergeWorkerDone(&mergeworker)==0 ){
|
|
rc = mergeWorkerStep(&mergeworker);
|
|
}
|
|
mergeWorkerShutdown(&mergeworker, &rc);
|
|
assert( rc!=LSM_OK || mergeworker.nWork==0 || pNew->lhs.iFirst );
|
|
if( rc==LSM_OK && pNew->lhs.iFirst ){
|
|
rc = lsmFsSortedFinish(pDb->pFS, &pNew->lhs);
|
|
}
|
|
nWrite = mergeworker.nWork;
|
|
pNew->flags &= ~LEVEL_INCOMPLETE;
|
|
if( eTree==TREE_NONE ){
|
|
pNew->flags |= LEVEL_FREELIST_ONLY;
|
|
}
|
|
pNew->pMerge = 0;
|
|
}
|
|
|
|
if( rc!=LSM_OK || pNew->lhs.iFirst==0 ){
|
|
assert( rc!=LSM_OK || pDb->pWorker->freelist.nEntry==0 );
|
|
lsmDbSnapshotSetLevel(pDb->pWorker, pNext);
|
|
sortedFreeLevel(pDb->pEnv, pNew);
|
|
}else{
|
|
if( pLinked ){
|
|
pLinked->iRoot = 0;
|
|
}else if( pDel ){
|
|
assert( pNew->pNext==pDel );
|
|
pNew->pNext = pDel->pNext;
|
|
lsmFsSortedDelete(pDb->pFS, pDb->pWorker, 1, &pDel->lhs);
|
|
sortedFreeLevel(pDb->pEnv, pDel);
|
|
}
|
|
|
|
#if LSM_LOG_STRUCTURE
|
|
lsmSortedDumpStructure(pDb, pDb->pWorker, LSM_LOG_DATA, 0, "new-toplevel");
|
|
#endif
|
|
|
|
if( freelist.nEntry ){
|
|
Freelist *p = &pDb->pWorker->freelist;
|
|
lsmFree(pDb->pEnv, p->aEntry);
|
|
memcpy(p, &freelist, sizeof(freelist));
|
|
freelist.aEntry = 0;
|
|
}else{
|
|
pDb->pWorker->freelist.nEntry = 0;
|
|
}
|
|
|
|
assertBtreeOk(pDb, &pNew->lhs);
|
|
sortedInvokeWorkHook(pDb);
|
|
}
|
|
|
|
if( pnWrite ) *pnWrite = nWrite;
|
|
pDb->pWorker->nWrite += nWrite;
|
|
pDb->pFreelist = 0;
|
|
pDb->bUseFreelist = 0;
|
|
lsmFree(pDb->pEnv, freelist.aEntry);
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** The nMerge levels in the LSM beginning with pLevel consist of a
|
|
** left-hand-side segment only. Replace these levels with a single new
|
|
** level consisting of a new empty segment on the left-hand-side and the
|
|
** nMerge segments from the replaced levels on the right-hand-side.
|
|
**
|
|
** Also, allocate and populate a Merge object and set Level.pMerge to
|
|
** point to it.
|
|
*/
|
|
static int sortedMergeSetup(
|
|
lsm_db *pDb, /* Database handle */
|
|
Level *pLevel, /* First level to merge */
|
|
int nMerge, /* Merge this many levels together */
|
|
Level **ppNew /* New, merged, level */
|
|
){
|
|
int rc = LSM_OK; /* Return Code */
|
|
Level *pNew; /* New Level object */
|
|
int bUseNext = 0; /* True to link in next separators */
|
|
Merge *pMerge; /* New Merge object */
|
|
int nByte; /* Bytes of space allocated at pMerge */
|
|
|
|
#ifdef LSM_DEBUG
|
|
int iLevel;
|
|
Level *pX = pLevel;
|
|
for(iLevel=0; iLevel<nMerge; iLevel++){
|
|
assert( pX->nRight==0 );
|
|
pX = pX->pNext;
|
|
}
|
|
#endif
|
|
|
|
/* Allocate the new Level object */
|
|
pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
|
|
if( pNew ){
|
|
pNew->aRhs = (Segment *)lsmMallocZeroRc(pDb->pEnv,
|
|
nMerge * sizeof(Segment), &rc);
|
|
}
|
|
|
|
/* Populate the new Level object */
|
|
if( rc==LSM_OK ){
|
|
Level *pNext = 0; /* Level following pNew */
|
|
int i;
|
|
int bFreeOnly = 1;
|
|
Level *pTopLevel;
|
|
Level *p = pLevel;
|
|
Level **pp;
|
|
pNew->nRight = nMerge;
|
|
pNew->iAge = pLevel->iAge+1;
|
|
for(i=0; i<nMerge; i++){
|
|
assert( p->nRight==0 );
|
|
pNext = p->pNext;
|
|
pNew->aRhs[i] = p->lhs;
|
|
if( (p->flags & LEVEL_FREELIST_ONLY)==0 ) bFreeOnly = 0;
|
|
sortedFreeLevel(pDb->pEnv, p);
|
|
p = pNext;
|
|
}
|
|
|
|
if( bFreeOnly ) pNew->flags |= LEVEL_FREELIST_ONLY;
|
|
|
|
/* Replace the old levels with the new. */
|
|
pTopLevel = lsmDbSnapshotLevel(pDb->pWorker);
|
|
pNew->pNext = p;
|
|
for(pp=&pTopLevel; *pp!=pLevel; pp=&((*pp)->pNext));
|
|
*pp = pNew;
|
|
lsmDbSnapshotSetLevel(pDb->pWorker, pTopLevel);
|
|
|
|
/* Determine whether or not the next separators will be linked in */
|
|
if( pNext && pNext->pMerge==0 && pNext->lhs.iRoot && pNext
|
|
&& (bFreeOnly==0 || (pNext->flags & LEVEL_FREELIST_ONLY))
|
|
){
|
|
bUseNext = 1;
|
|
}
|
|
}
|
|
|
|
/* Allocate the merge object */
|
|
nByte = sizeof(Merge) + sizeof(MergeInput) * (nMerge + bUseNext);
|
|
pMerge = (Merge *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
|
|
if( pMerge ){
|
|
pMerge->aInput = (MergeInput *)&pMerge[1];
|
|
pMerge->nInput = nMerge + bUseNext;
|
|
pNew->pMerge = pMerge;
|
|
}
|
|
|
|
*ppNew = pNew;
|
|
return rc;
|
|
}
|
|
|
|
static int mergeWorkerInit(
|
|
lsm_db *pDb, /* Db connection to do merge work */
|
|
Level *pLevel, /* Level to work on merging */
|
|
MergeWorker *pMW /* Object to initialize */
|
|
){
|
|
int rc = LSM_OK; /* Return code */
|
|
Merge *pMerge = pLevel->pMerge; /* Persistent part of merge state */
|
|
MultiCursor *pCsr = 0; /* Cursor opened for pMW */
|
|
Level *pNext = pLevel->pNext; /* Next level in LSM */
|
|
|
|
assert( pDb->pWorker );
|
|
assert( pLevel->pMerge );
|
|
assert( pLevel->nRight>0 );
|
|
|
|
memset(pMW, 0, sizeof(MergeWorker));
|
|
pMW->pDb = pDb;
|
|
pMW->pLevel = pLevel;
|
|
pMW->aGobble = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmPgno)*pLevel->nRight,&rc);
|
|
|
|
/* Create a multi-cursor to read the data to write to the new
|
|
** segment. The new segment contains:
|
|
**
|
|
** 1. Records from LHS of each of the nMerge levels being merged.
|
|
** 2. Separators from either the last level being merged, or the
|
|
** separators attached to the LHS of the following level, or neither.
|
|
**
|
|
** If the new level is the lowest (oldest) in the db, discard any
|
|
** delete keys. Key annihilation.
|
|
*/
|
|
pCsr = multiCursorNew(pDb, &rc);
|
|
if( pCsr ){
|
|
pCsr->flags |= CURSOR_NEXT_OK;
|
|
rc = multiCursorAddRhs(pCsr, pLevel);
|
|
}
|
|
if( rc==LSM_OK && pMerge->nInput > pLevel->nRight ){
|
|
rc = btreeCursorNew(pDb, &pNext->lhs, &pCsr->pBtCsr);
|
|
}else if( pNext ){
|
|
multiCursorReadSeparators(pCsr);
|
|
}else{
|
|
multiCursorIgnoreDelete(pCsr);
|
|
}
|
|
|
|
assert( rc!=LSM_OK || pMerge->nInput==(pCsr->nPtr+(pCsr->pBtCsr!=0)) );
|
|
pMW->pCsr = pCsr;
|
|
|
|
/* Load the b-tree hierarchy into memory. */
|
|
if( rc==LSM_OK ) rc = mergeWorkerLoadHierarchy(pMW);
|
|
if( rc==LSM_OK && pMW->hier.nHier==0 ){
|
|
pMW->aSave[0].iPgno = pLevel->lhs.iFirst;
|
|
}
|
|
|
|
/* Position the cursor. */
|
|
if( rc==LSM_OK ){
|
|
pCsr->pPrevMergePtr = &pMerge->iCurrentPtr;
|
|
if( pLevel->lhs.iFirst==0 ){
|
|
/* The output array is still empty. So position the cursor at the very
|
|
** start of the input. */
|
|
rc = multiCursorEnd(pCsr, 0);
|
|
}else{
|
|
/* The output array is non-empty. Position the cursor based on the
|
|
** page/cell data saved in the Merge.aInput[] array. */
|
|
int i;
|
|
for(i=0; rc==LSM_OK && i<pCsr->nPtr; i++){
|
|
MergeInput *pInput = &pMerge->aInput[i];
|
|
if( pInput->iPg ){
|
|
SegmentPtr *pPtr;
|
|
assert( pCsr->aPtr[i].pPg==0 );
|
|
pPtr = &pCsr->aPtr[i];
|
|
rc = segmentPtrLoadPage(pDb->pFS, pPtr, pInput->iPg);
|
|
if( rc==LSM_OK && pPtr->nCell>0 ){
|
|
rc = segmentPtrLoadCell(pPtr, pInput->iCell);
|
|
}
|
|
}
|
|
}
|
|
|
|
if( rc==LSM_OK && pCsr->pBtCsr ){
|
|
int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
|
|
assert( i==pCsr->nPtr );
|
|
rc = btreeCursorRestore(pCsr->pBtCsr, xCmp, &pMerge->aInput[i]);
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
rc = multiCursorSetupTree(pCsr, 0);
|
|
}
|
|
}
|
|
pCsr->flags |= CURSOR_NEXT_OK;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int sortedBtreeGobble(
|
|
lsm_db *pDb, /* Worker connection */
|
|
MultiCursor *pCsr, /* Multi-cursor being used for a merge */
|
|
int iGobble /* pCsr->aPtr[] entry to operate on */
|
|
){
|
|
int rc = LSM_OK;
|
|
if( rtTopic(pCsr->eType)==0 ){
|
|
Segment *pSeg = pCsr->aPtr[iGobble].pSeg;
|
|
LsmPgno *aPg;
|
|
int nPg;
|
|
|
|
/* Seek from the root of the b-tree to the segment leaf that may contain
|
|
** a key equal to the one multi-cursor currently points to. Record the
|
|
** page number of each b-tree page and the leaf. The segment may be
|
|
** gobbled up to (but not including) the first of these page numbers.
|
|
*/
|
|
assert( pSeg->iRoot>0 );
|
|
aPg = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmPgno)*32, &rc);
|
|
if( rc==LSM_OK ){
|
|
rc = seekInBtree(pCsr, pSeg,
|
|
rtTopic(pCsr->eType), pCsr->key.pData, pCsr->key.nData, aPg, 0
|
|
);
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
for(nPg=0; aPg[nPg]; nPg++);
|
|
lsmFsGobble(pDb, pSeg, aPg, nPg);
|
|
}
|
|
|
|
lsmFree(pDb->pEnv, aPg);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Argument p points to a level of age N. Return the number of levels in
|
|
** the linked list starting at p that have age=N (always at least 1).
|
|
*/
|
|
static int sortedCountLevels(Level *p){
|
|
int iAge = p->iAge;
|
|
int nRet = 0;
|
|
do {
|
|
nRet++;
|
|
p = p->pNext;
|
|
}while( p && p->iAge==iAge );
|
|
return nRet;
|
|
}
|
|
|
|
static int sortedSelectLevel(lsm_db *pDb, int nMerge, Level **ppOut){
|
|
Level *pTopLevel = lsmDbSnapshotLevel(pDb->pWorker);
|
|
int rc = LSM_OK;
|
|
Level *pLevel = 0; /* Output value */
|
|
Level *pBest = 0; /* Best level to work on found so far */
|
|
int nBest; /* Number of segments merged at pBest */
|
|
Level *pThis = 0; /* First in run of levels with age=iAge */
|
|
int nThis = 0; /* Number of levels starting at pThis */
|
|
|
|
assert( nMerge>=1 );
|
|
nBest = LSM_MAX(1, nMerge-1);
|
|
|
|
/* Find the longest contiguous run of levels not currently undergoing a
|
|
** merge with the same age in the structure. Or the level being merged
|
|
** with the largest number of right-hand segments. Work on it. */
|
|
for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){
|
|
if( pLevel->nRight==0 && pThis && pLevel->iAge==pThis->iAge ){
|
|
nThis++;
|
|
}else{
|
|
if( nThis>nBest ){
|
|
if( (pLevel->iAge!=pThis->iAge+1)
|
|
|| (pLevel->nRight==0 && sortedCountLevels(pLevel)<=pDb->nMerge)
|
|
){
|
|
pBest = pThis;
|
|
nBest = nThis;
|
|
}
|
|
}
|
|
if( pLevel->nRight ){
|
|
if( pLevel->nRight>nBest ){
|
|
nBest = pLevel->nRight;
|
|
pBest = pLevel;
|
|
}
|
|
nThis = 0;
|
|
pThis = 0;
|
|
}else{
|
|
pThis = pLevel;
|
|
nThis = 1;
|
|
}
|
|
}
|
|
}
|
|
if( nThis>nBest ){
|
|
assert( pThis );
|
|
pBest = pThis;
|
|
nBest = nThis;
|
|
}
|
|
|
|
if( pBest==0 && nMerge==1 ){
|
|
int nFree = 0;
|
|
int nUsr = 0;
|
|
for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){
|
|
assert( !pLevel->nRight );
|
|
if( pLevel->flags & LEVEL_FREELIST_ONLY ){
|
|
nFree++;
|
|
}else{
|
|
nUsr++;
|
|
}
|
|
}
|
|
if( nUsr>1 ){
|
|
pBest = pTopLevel;
|
|
nBest = nFree + nUsr;
|
|
}
|
|
}
|
|
|
|
if( pBest ){
|
|
if( pBest->nRight==0 ){
|
|
rc = sortedMergeSetup(pDb, pBest, nBest, ppOut);
|
|
}else{
|
|
*ppOut = pBest;
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int sortedDbIsFull(lsm_db *pDb){
|
|
Level *pTop = lsmDbSnapshotLevel(pDb->pWorker);
|
|
|
|
if( lsmDatabaseFull(pDb) ) return 1;
|
|
if( pTop && pTop->iAge==0
|
|
&& (pTop->nRight || sortedCountLevels(pTop)>=pDb->nMerge)
|
|
){
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
typedef struct MoveBlockCtx MoveBlockCtx;
|
|
struct MoveBlockCtx {
|
|
int iSeen; /* Previous free block on list */
|
|
int iFrom; /* Total number of blocks in file */
|
|
};
|
|
|
|
static int moveBlockCb(void *pCtx, int iBlk, i64 iSnapshot){
|
|
MoveBlockCtx *p = (MoveBlockCtx *)pCtx;
|
|
assert( p->iFrom==0 );
|
|
if( iBlk==(p->iSeen-1) ){
|
|
p->iSeen = iBlk;
|
|
return 0;
|
|
}
|
|
p->iFrom = p->iSeen-1;
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
** This function is called to further compact a database for which all
|
|
** of the content has already been merged into a single segment. If
|
|
** possible, it moves the contents of a single block from the end of the
|
|
** file to a free-block that lies closer to the start of the file (allowing
|
|
** the file to be eventually truncated).
|
|
*/
|
|
static int sortedMoveBlock(lsm_db *pDb, int *pnWrite){
|
|
Snapshot *p = pDb->pWorker;
|
|
Level *pLvl = lsmDbSnapshotLevel(p);
|
|
int iFrom; /* Block to move */
|
|
int iTo; /* Destination to move block to */
|
|
int rc; /* Return code */
|
|
|
|
MoveBlockCtx sCtx;
|
|
|
|
assert( pLvl->pNext==0 && pLvl->nRight==0 );
|
|
assert( p->redirect.n<=LSM_MAX_BLOCK_REDIRECTS );
|
|
|
|
*pnWrite = 0;
|
|
|
|
/* Check that the redirect array is not already full. If it is, return
|
|
** without moving any database content. */
|
|
if( p->redirect.n>=LSM_MAX_BLOCK_REDIRECTS ) return LSM_OK;
|
|
|
|
/* Find the last block of content in the database file. Do this by
|
|
** traversing the free-list in reverse (descending block number) order.
|
|
** The first block not on the free list is the one that will be moved.
|
|
** Since the db consists of a single segment, there is no ambiguity as
|
|
** to which segment the block belongs to. */
|
|
sCtx.iSeen = p->nBlock+1;
|
|
sCtx.iFrom = 0;
|
|
rc = lsmWalkFreelist(pDb, 1, moveBlockCb, &sCtx);
|
|
if( rc!=LSM_OK || sCtx.iFrom==0 ) return rc;
|
|
iFrom = sCtx.iFrom;
|
|
|
|
/* Find the first free block in the database, ignoring block 1. Block
|
|
** 1 is tricky as it is smaller than the other blocks. */
|
|
rc = lsmBlockAllocate(pDb, iFrom, &iTo);
|
|
if( rc!=LSM_OK || iTo==0 ) return rc;
|
|
assert( iTo!=1 && iTo<iFrom );
|
|
|
|
rc = lsmFsMoveBlock(pDb->pFS, &pLvl->lhs, iTo, iFrom);
|
|
if( rc==LSM_OK ){
|
|
if( p->redirect.a==0 ){
|
|
int nByte = sizeof(struct RedirectEntry) * LSM_MAX_BLOCK_REDIRECTS;
|
|
p->redirect.a = lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
|
|
}
|
|
if( rc==LSM_OK ){
|
|
|
|
/* Check if the block just moved was already redirected. */
|
|
int i;
|
|
for(i=0; i<p->redirect.n; i++){
|
|
if( p->redirect.a[i].iTo==iFrom ) break;
|
|
}
|
|
|
|
if( i==p->redirect.n ){
|
|
/* Block iFrom was not already redirected. Add a new array entry. */
|
|
memmove(&p->redirect.a[1], &p->redirect.a[0],
|
|
sizeof(struct RedirectEntry) * p->redirect.n
|
|
);
|
|
p->redirect.a[0].iFrom = iFrom;
|
|
p->redirect.a[0].iTo = iTo;
|
|
p->redirect.n++;
|
|
}else{
|
|
/* Block iFrom was already redirected. Overwrite existing entry. */
|
|
p->redirect.a[i].iTo = iTo;
|
|
}
|
|
|
|
rc = lsmBlockFree(pDb, iFrom);
|
|
|
|
*pnWrite = lsmFsBlockSize(pDb->pFS) / lsmFsPageSize(pDb->pFS);
|
|
pLvl->lhs.pRedirect = &p->redirect;
|
|
}
|
|
}
|
|
|
|
#if LSM_LOG_STRUCTURE
|
|
if( rc==LSM_OK ){
|
|
char aBuf[64];
|
|
sprintf(aBuf, "move-block %d/%d", p->redirect.n-1, LSM_MAX_BLOCK_REDIRECTS);
|
|
lsmSortedDumpStructure(pDb, pDb->pWorker, LSM_LOG_DATA, 0, aBuf);
|
|
}
|
|
#endif
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
*/
|
|
static int mergeInsertFreelistSegments(
|
|
lsm_db *pDb,
|
|
int nFree,
|
|
MergeWorker *pMW
|
|
){
|
|
int rc = LSM_OK;
|
|
if( nFree>0 ){
|
|
MultiCursor *pCsr = pMW->pCsr;
|
|
Level *pLvl = pMW->pLevel;
|
|
SegmentPtr *aNew1;
|
|
Segment *aNew2;
|
|
|
|
Level *pIter;
|
|
Level *pNext;
|
|
int i = 0;
|
|
|
|
aNew1 = (SegmentPtr *)lsmMallocZeroRc(
|
|
pDb->pEnv, sizeof(SegmentPtr) * (pCsr->nPtr+nFree), &rc
|
|
);
|
|
if( rc ) return rc;
|
|
memcpy(&aNew1[nFree], pCsr->aPtr, sizeof(SegmentPtr)*pCsr->nPtr);
|
|
pCsr->nPtr += nFree;
|
|
lsmFree(pDb->pEnv, pCsr->aTree);
|
|
lsmFree(pDb->pEnv, pCsr->aPtr);
|
|
pCsr->aTree = 0;
|
|
pCsr->aPtr = aNew1;
|
|
|
|
aNew2 = (Segment *)lsmMallocZeroRc(
|
|
pDb->pEnv, sizeof(Segment) * (pLvl->nRight+nFree), &rc
|
|
);
|
|
if( rc ) return rc;
|
|
memcpy(&aNew2[nFree], pLvl->aRhs, sizeof(Segment)*pLvl->nRight);
|
|
pLvl->nRight += nFree;
|
|
lsmFree(pDb->pEnv, pLvl->aRhs);
|
|
pLvl->aRhs = aNew2;
|
|
|
|
for(pIter=pDb->pWorker->pLevel; rc==LSM_OK && pIter!=pLvl; pIter=pNext){
|
|
Segment *pSeg = &pLvl->aRhs[i];
|
|
memcpy(pSeg, &pIter->lhs, sizeof(Segment));
|
|
|
|
pCsr->aPtr[i].pSeg = pSeg;
|
|
pCsr->aPtr[i].pLevel = pLvl;
|
|
rc = segmentPtrEnd(pCsr, &pCsr->aPtr[i], 0);
|
|
|
|
pDb->pWorker->pLevel = pNext = pIter->pNext;
|
|
sortedFreeLevel(pDb->pEnv, pIter);
|
|
i++;
|
|
}
|
|
assert( i==nFree );
|
|
assert( rc!=LSM_OK || pDb->pWorker->pLevel==pLvl );
|
|
|
|
for(i=nFree; i<pCsr->nPtr; i++){
|
|
pCsr->aPtr[i].pSeg = &pLvl->aRhs[i];
|
|
}
|
|
|
|
lsmFree(pDb->pEnv, pMW->aGobble);
|
|
pMW->aGobble = 0;
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
static int sortedWork(
|
|
lsm_db *pDb, /* Database handle. Must be worker. */
|
|
int nWork, /* Number of pages of work to do */
|
|
int nMerge, /* Try to merge this many levels at once */
|
|
int bFlush, /* Set if call is to make room for a flush */
|
|
int *pnWrite /* OUT: Actual number of pages written */
|
|
){
|
|
int rc = LSM_OK; /* Return Code */
|
|
int nRemaining = nWork; /* Units of work to do before returning */
|
|
Snapshot *pWorker = pDb->pWorker;
|
|
|
|
assert( pWorker );
|
|
if( lsmDbSnapshotLevel(pWorker)==0 ) return LSM_OK;
|
|
|
|
while( nRemaining>0 ){
|
|
Level *pLevel = 0;
|
|
|
|
/* Find a level to work on. */
|
|
rc = sortedSelectLevel(pDb, nMerge, &pLevel);
|
|
assert( rc==LSM_OK || pLevel==0 );
|
|
|
|
if( pLevel==0 ){
|
|
int nDone = 0;
|
|
Level *pTopLevel = lsmDbSnapshotLevel(pDb->pWorker);
|
|
if( bFlush==0 && nMerge==1 && pTopLevel && pTopLevel->pNext==0 ){
|
|
rc = sortedMoveBlock(pDb, &nDone);
|
|
}
|
|
nRemaining -= nDone;
|
|
|
|
/* Could not find any work to do. Finished. */
|
|
if( nDone==0 ) break;
|
|
}else{
|
|
int bSave = 0;
|
|
Freelist freelist = {0, 0, 0};
|
|
MergeWorker mergeworker; /* State used to work on the level merge */
|
|
|
|
assert( pDb->bIncrMerge==0 );
|
|
assert( pDb->pFreelist==0 && pDb->bUseFreelist==0 );
|
|
|
|
pDb->bIncrMerge = 1;
|
|
rc = mergeWorkerInit(pDb, pLevel, &mergeworker);
|
|
assert( mergeworker.nWork==0 );
|
|
|
|
while( rc==LSM_OK
|
|
&& 0==mergeWorkerDone(&mergeworker)
|
|
&& (mergeworker.nWork<nRemaining || pDb->bUseFreelist)
|
|
){
|
|
int eType = rtTopic(mergeworker.pCsr->eType);
|
|
rc = mergeWorkerStep(&mergeworker);
|
|
|
|
/* If the cursor now points at the first entry past the end of the
|
|
** user data (i.e. either to EOF or to the first free-list entry
|
|
** that will be added to the run), then check if it is possible to
|
|
** merge in any free-list entries that are either in-memory or in
|
|
** free-list-only blocks. */
|
|
if( rc==LSM_OK && nMerge==1 && eType==0
|
|
&& (rtTopic(mergeworker.pCsr->eType) || mergeWorkerDone(&mergeworker))
|
|
){
|
|
int nFree = 0; /* Number of free-list-only levels to merge */
|
|
Level *pLvl;
|
|
assert( pDb->pFreelist==0 && pDb->bUseFreelist==0 );
|
|
|
|
/* Now check if all levels containing data newer than this one
|
|
** are single-segment free-list only levels. If so, they will be
|
|
** merged in now. */
|
|
for(pLvl=pDb->pWorker->pLevel;
|
|
pLvl!=mergeworker.pLevel && (pLvl->flags & LEVEL_FREELIST_ONLY);
|
|
pLvl=pLvl->pNext
|
|
){
|
|
assert( pLvl->nRight==0 );
|
|
nFree++;
|
|
}
|
|
if( pLvl==mergeworker.pLevel ){
|
|
|
|
rc = mergeInsertFreelistSegments(pDb, nFree, &mergeworker);
|
|
if( rc==LSM_OK ){
|
|
rc = multiCursorVisitFreelist(mergeworker.pCsr);
|
|
}
|
|
if( rc==LSM_OK ){
|
|
rc = multiCursorSetupTree(mergeworker.pCsr, 0);
|
|
pDb->pFreelist = &freelist;
|
|
pDb->bUseFreelist = 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
nRemaining -= LSM_MAX(mergeworker.nWork, 1);
|
|
|
|
if( rc==LSM_OK ){
|
|
/* Check if the merge operation is completely finished. If not,
|
|
** gobble up (declare eligible for recycling) any pages from rhs
|
|
** segments for which the content has been completely merged into
|
|
** the lhs of the level. */
|
|
if( mergeWorkerDone(&mergeworker)==0 ){
|
|
int i;
|
|
for(i=0; i<pLevel->nRight; i++){
|
|
SegmentPtr *pGobble = &mergeworker.pCsr->aPtr[i];
|
|
if( pGobble->pSeg->iRoot ){
|
|
rc = sortedBtreeGobble(pDb, mergeworker.pCsr, i);
|
|
}else if( mergeworker.aGobble[i] ){
|
|
lsmFsGobble(pDb, pGobble->pSeg, &mergeworker.aGobble[i], 1);
|
|
}
|
|
}
|
|
}else{
|
|
int i;
|
|
int bEmpty;
|
|
mergeWorkerShutdown(&mergeworker, &rc);
|
|
bEmpty = (pLevel->lhs.iFirst==0);
|
|
|
|
if( bEmpty==0 && rc==LSM_OK ){
|
|
rc = lsmFsSortedFinish(pDb->pFS, &pLevel->lhs);
|
|
}
|
|
|
|
if( pDb->bUseFreelist ){
|
|
Freelist *p = &pDb->pWorker->freelist;
|
|
lsmFree(pDb->pEnv, p->aEntry);
|
|
memcpy(p, &freelist, sizeof(freelist));
|
|
pDb->bUseFreelist = 0;
|
|
pDb->pFreelist = 0;
|
|
bSave = 1;
|
|
}
|
|
|
|
for(i=0; i<pLevel->nRight; i++){
|
|
lsmFsSortedDelete(pDb->pFS, pWorker, 1, &pLevel->aRhs[i]);
|
|
}
|
|
|
|
if( bEmpty ){
|
|
/* If the new level is completely empty, remove it from the
|
|
** database snapshot. This can only happen if all input keys were
|
|
** annihilated. Since keys are only annihilated if the new level
|
|
** is the last in the linked list (contains the most ancient of
|
|
** database content), this guarantees that pLevel->pNext==0. */
|
|
Level *pTop; /* Top level of worker snapshot */
|
|
Level **pp; /* Read/write iterator for Level.pNext list */
|
|
|
|
assert( pLevel->pNext==0 );
|
|
|
|
/* Remove the level from the worker snapshot. */
|
|
pTop = lsmDbSnapshotLevel(pWorker);
|
|
for(pp=&pTop; *pp!=pLevel; pp=&((*pp)->pNext));
|
|
*pp = pLevel->pNext;
|
|
lsmDbSnapshotSetLevel(pWorker, pTop);
|
|
|
|
/* Free the Level structure. */
|
|
sortedFreeLevel(pDb->pEnv, pLevel);
|
|
}else{
|
|
|
|
/* Free the separators of the next level, if required. */
|
|
if( pLevel->pMerge->nInput > pLevel->nRight ){
|
|
assert( pLevel->pNext->lhs.iRoot );
|
|
pLevel->pNext->lhs.iRoot = 0;
|
|
}
|
|
|
|
/* Zero the right-hand-side of pLevel */
|
|
lsmFree(pDb->pEnv, pLevel->aRhs);
|
|
pLevel->nRight = 0;
|
|
pLevel->aRhs = 0;
|
|
|
|
/* Free the Merge object */
|
|
lsmFree(pDb->pEnv, pLevel->pMerge);
|
|
pLevel->pMerge = 0;
|
|
}
|
|
|
|
if( bSave && rc==LSM_OK ){
|
|
pDb->bIncrMerge = 0;
|
|
rc = lsmSaveWorker(pDb, 0);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Clean up the MergeWorker object initialized above. If no error
|
|
** has occurred, invoke the work-hook to inform the application that
|
|
** the database structure has changed. */
|
|
mergeWorkerShutdown(&mergeworker, &rc);
|
|
pDb->bIncrMerge = 0;
|
|
if( rc==LSM_OK ) sortedInvokeWorkHook(pDb);
|
|
|
|
#if LSM_LOG_STRUCTURE
|
|
lsmSortedDumpStructure(pDb, pDb->pWorker, LSM_LOG_DATA, 0, "work");
|
|
#endif
|
|
assertBtreeOk(pDb, &pLevel->lhs);
|
|
assertRunInOrder(pDb, &pLevel->lhs);
|
|
|
|
/* If bFlush is true and the database is no longer considered "full",
|
|
** break out of the loop even if nRemaining is still greater than
|
|
** zero. The caller has an in-memory tree to flush to disk. */
|
|
if( bFlush && sortedDbIsFull(pDb)==0 ) break;
|
|
}
|
|
}
|
|
|
|
if( pnWrite ) *pnWrite = (nWork - nRemaining);
|
|
pWorker->nWrite += (nWork - nRemaining);
|
|
|
|
#ifdef LSM_LOG_WORK
|
|
lsmLogMessage(pDb, rc, "sortedWork(): %d pages", (nWork-nRemaining));
|
|
#endif
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** The database connection passed as the first argument must be a worker
|
|
** connection. This function checks if there exists an "old" in-memory tree
|
|
** ready to be flushed to disk. If so, true is returned. Otherwise false.
|
|
**
|
|
** If an error occurs, *pRc is set to an LSM error code before returning.
|
|
** It is assumed that *pRc is set to LSM_OK when this function is called.
|
|
*/
|
|
static int sortedTreeHasOld(lsm_db *pDb, int *pRc){
|
|
int rc = LSM_OK;
|
|
int bRet = 0;
|
|
|
|
assert( pDb->pWorker );
|
|
if( *pRc==LSM_OK ){
|
|
if( rc==LSM_OK
|
|
&& pDb->treehdr.iOldShmid
|
|
&& pDb->treehdr.iOldLog!=pDb->pWorker->iLogOff
|
|
){
|
|
bRet = 1;
|
|
}else{
|
|
bRet = 0;
|
|
}
|
|
*pRc = rc;
|
|
}
|
|
assert( *pRc==LSM_OK || bRet==0 );
|
|
return bRet;
|
|
}
|
|
|
|
/*
|
|
** Create a new free-list only top-level segment. Return LSM_OK if successful
|
|
** or an LSM error code if some error occurs.
|
|
*/
|
|
static int sortedNewFreelistOnly(lsm_db *pDb){
|
|
return sortedNewToplevel(pDb, TREE_NONE, 0);
|
|
}
|
|
|
|
int lsmSaveWorker(lsm_db *pDb, int bFlush){
|
|
Snapshot *p = pDb->pWorker;
|
|
if( p->freelist.nEntry>pDb->nMaxFreelist ){
|
|
int rc = sortedNewFreelistOnly(pDb);
|
|
if( rc!=LSM_OK ) return rc;
|
|
}
|
|
return lsmCheckpointSaveWorker(pDb, bFlush);
|
|
}
|
|
|
|
static int doLsmSingleWork(
|
|
lsm_db *pDb,
|
|
int bShutdown,
|
|
int nMerge, /* Minimum segments to merge together */
|
|
int nPage, /* Number of pages to write to disk */
|
|
int *pnWrite, /* OUT: Pages actually written to disk */
|
|
int *pbCkpt /* OUT: True if an auto-checkpoint is req. */
|
|
){
|
|
Snapshot *pWorker; /* Worker snapshot */
|
|
int rc = LSM_OK; /* Return code */
|
|
int bDirty = 0;
|
|
int nMax = nPage; /* Maximum pages to write to disk */
|
|
int nRem = nPage;
|
|
int bCkpt = 0;
|
|
|
|
assert( nPage>0 );
|
|
|
|
/* Open the worker 'transaction'. It will be closed before this function
|
|
** returns. */
|
|
assert( pDb->pWorker==0 );
|
|
rc = lsmBeginWork(pDb);
|
|
if( rc!=LSM_OK ) return rc;
|
|
pWorker = pDb->pWorker;
|
|
|
|
/* If this connection is doing auto-checkpoints, set nMax (and nRem) so
|
|
** that this call stops writing when the auto-checkpoint is due. The
|
|
** caller will do the checkpoint, then possibly call this function again. */
|
|
if( bShutdown==0 && pDb->nAutockpt ){
|
|
u32 nSync;
|
|
u32 nUnsync;
|
|
int nPgsz;
|
|
|
|
lsmCheckpointSynced(pDb, 0, 0, &nSync);
|
|
nUnsync = lsmCheckpointNWrite(pDb->pShmhdr->aSnap1, 0);
|
|
nPgsz = lsmCheckpointPgsz(pDb->pShmhdr->aSnap1);
|
|
|
|
nMax = (int)LSM_MIN(nMax, (pDb->nAutockpt/nPgsz) - (int)(nUnsync-nSync));
|
|
if( nMax<nRem ){
|
|
bCkpt = 1;
|
|
nRem = LSM_MAX(nMax, 0);
|
|
}
|
|
}
|
|
|
|
/* If there exists in-memory data ready to be flushed to disk, attempt
|
|
** to flush it now. */
|
|
if( pDb->nTransOpen==0 ){
|
|
rc = lsmTreeLoadHeader(pDb, 0);
|
|
}
|
|
if( sortedTreeHasOld(pDb, &rc) ){
|
|
/* sortedDbIsFull() returns non-zero if either (a) there are too many
|
|
** levels in total in the db, or (b) there are too many levels with the
|
|
** the same age in the db. Either way, call sortedWork() to merge
|
|
** existing segments together until this condition is cleared. */
|
|
if( sortedDbIsFull(pDb) ){
|
|
int nPg = 0;
|
|
rc = sortedWork(pDb, nRem, nMerge, 1, &nPg);
|
|
nRem -= nPg;
|
|
assert( rc!=LSM_OK || nRem<=0 || !sortedDbIsFull(pDb) );
|
|
bDirty = 1;
|
|
}
|
|
|
|
if( rc==LSM_OK && nRem>0 ){
|
|
int nPg = 0;
|
|
rc = sortedNewToplevel(pDb, TREE_OLD, &nPg);
|
|
nRem -= nPg;
|
|
if( rc==LSM_OK ){
|
|
if( pDb->nTransOpen>0 ){
|
|
lsmTreeDiscardOld(pDb);
|
|
}
|
|
rc = lsmSaveWorker(pDb, 1);
|
|
bDirty = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* If nPage is still greater than zero, do some merging. */
|
|
if( rc==LSM_OK && nRem>0 && bShutdown==0 ){
|
|
int nPg = 0;
|
|
rc = sortedWork(pDb, nRem, nMerge, 0, &nPg);
|
|
nRem -= nPg;
|
|
if( nPg ) bDirty = 1;
|
|
}
|
|
|
|
/* If the in-memory part of the free-list is too large, write a new
|
|
** top-level containing just the in-memory free-list entries to disk. */
|
|
if( rc==LSM_OK && pDb->pWorker->freelist.nEntry > pDb->nMaxFreelist ){
|
|
while( rc==LSM_OK && lsmDatabaseFull(pDb) ){
|
|
int nPg = 0;
|
|
rc = sortedWork(pDb, 16, nMerge, 1, &nPg);
|
|
nRem -= nPg;
|
|
}
|
|
if( rc==LSM_OK ){
|
|
rc = sortedNewFreelistOnly(pDb);
|
|
}
|
|
bDirty = 1;
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
*pnWrite = (nMax - nRem);
|
|
*pbCkpt = (bCkpt && nRem<=0);
|
|
if( nMerge==1 && pDb->nAutockpt>0 && *pnWrite>0
|
|
&& pWorker->pLevel
|
|
&& pWorker->pLevel->nRight==0
|
|
&& pWorker->pLevel->pNext==0
|
|
){
|
|
*pbCkpt = 1;
|
|
}
|
|
}
|
|
|
|
if( rc==LSM_OK && bDirty ){
|
|
lsmFinishWork(pDb, 0, &rc);
|
|
}else{
|
|
int rcdummy = LSM_BUSY;
|
|
lsmFinishWork(pDb, 0, &rcdummy);
|
|
*pnWrite = 0;
|
|
}
|
|
assert( pDb->pWorker==0 );
|
|
return rc;
|
|
}
|
|
|
|
static int doLsmWork(lsm_db *pDb, int nMerge, int nPage, int *pnWrite){
|
|
int rc = LSM_OK; /* Return code */
|
|
int nWrite = 0; /* Number of pages written */
|
|
|
|
assert( nMerge>=1 );
|
|
|
|
if( nPage!=0 ){
|
|
int bCkpt = 0;
|
|
do {
|
|
int nThis = 0;
|
|
int nReq = (nPage>=0) ? (nPage-nWrite) : ((int)0x7FFFFFFF);
|
|
|
|
bCkpt = 0;
|
|
rc = doLsmSingleWork(pDb, 0, nMerge, nReq, &nThis, &bCkpt);
|
|
nWrite += nThis;
|
|
if( rc==LSM_OK && bCkpt ){
|
|
rc = lsm_checkpoint(pDb, 0);
|
|
}
|
|
}while( rc==LSM_OK && bCkpt && (nWrite<nPage || nPage<0) );
|
|
}
|
|
|
|
if( pnWrite ){
|
|
if( rc==LSM_OK ){
|
|
*pnWrite = nWrite;
|
|
}else{
|
|
*pnWrite = 0;
|
|
}
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Perform work to merge database segments together.
|
|
*/
|
|
int lsm_work(lsm_db *pDb, int nMerge, int nKB, int *pnWrite){
|
|
int rc; /* Return code */
|
|
int nPgsz; /* Nominal page size in bytes */
|
|
int nPage; /* Equivalent of nKB in pages */
|
|
int nWrite = 0; /* Number of pages written */
|
|
|
|
/* This function may not be called if pDb has an open read or write
|
|
** transaction. Return LSM_MISUSE if an application attempts this. */
|
|
if( pDb->nTransOpen || pDb->pCsr ) return LSM_MISUSE_BKPT;
|
|
if( nMerge<=0 ) nMerge = pDb->nMerge;
|
|
|
|
lsmFsPurgeCache(pDb->pFS);
|
|
|
|
/* Convert from KB to pages */
|
|
nPgsz = lsmFsPageSize(pDb->pFS);
|
|
if( nKB>=0 ){
|
|
nPage = ((i64)nKB * 1024 + nPgsz - 1) / nPgsz;
|
|
}else{
|
|
nPage = -1;
|
|
}
|
|
|
|
rc = doLsmWork(pDb, nMerge, nPage, &nWrite);
|
|
|
|
if( pnWrite ){
|
|
/* Convert back from pages to KB */
|
|
*pnWrite = (int)(((i64)nWrite * 1024 + nPgsz - 1) / nPgsz);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
int lsm_flush(lsm_db *db){
|
|
int rc;
|
|
|
|
if( db->nTransOpen>0 || db->pCsr ){
|
|
rc = LSM_MISUSE_BKPT;
|
|
}else{
|
|
rc = lsmBeginWriteTrans(db);
|
|
if( rc==LSM_OK ){
|
|
lsmFlushTreeToDisk(db);
|
|
lsmTreeDiscardOld(db);
|
|
lsmTreeMakeOld(db);
|
|
lsmTreeDiscardOld(db);
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
rc = lsmFinishWriteTrans(db, 1);
|
|
}else{
|
|
lsmFinishWriteTrans(db, 0);
|
|
}
|
|
lsmFinishReadTrans(db);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** This function is called in auto-work mode to perform merging work on
|
|
** the data structure. It performs enough merging work to prevent the
|
|
** height of the tree from growing indefinitely assuming that roughly
|
|
** nUnit database pages worth of data have been written to the database
|
|
** (i.e. the in-memory tree) since the last call.
|
|
*/
|
|
int lsmSortedAutoWork(
|
|
lsm_db *pDb, /* Database handle */
|
|
int nUnit /* Pages of data written to in-memory tree */
|
|
){
|
|
int rc = LSM_OK; /* Return code */
|
|
int nDepth = 0; /* Current height of tree (longest path) */
|
|
Level *pLevel; /* Used to iterate through levels */
|
|
int bRestore = 0;
|
|
|
|
assert( pDb->pWorker==0 );
|
|
assert( pDb->nTransOpen>0 );
|
|
|
|
/* Determine how many units of work to do before returning. One unit of
|
|
** work is achieved by writing one page (~4KB) of merged data. */
|
|
for(pLevel=lsmDbSnapshotLevel(pDb->pClient); pLevel; pLevel=pLevel->pNext){
|
|
/* nDepth += LSM_MAX(1, pLevel->nRight); */
|
|
nDepth += 1;
|
|
}
|
|
if( lsmTreeHasOld(pDb) ){
|
|
nDepth += 1;
|
|
bRestore = 1;
|
|
rc = lsmSaveCursors(pDb);
|
|
if( rc!=LSM_OK ) return rc;
|
|
}
|
|
|
|
if( nDepth>0 ){
|
|
int nRemaining; /* Units of work to do before returning */
|
|
|
|
nRemaining = nUnit * nDepth;
|
|
#ifdef LSM_LOG_WORK
|
|
lsmLogMessage(pDb, rc, "lsmSortedAutoWork(): %d*%d = %d pages",
|
|
nUnit, nDepth, nRemaining);
|
|
#endif
|
|
assert( nRemaining>=0 );
|
|
rc = doLsmWork(pDb, pDb->nMerge, nRemaining, 0);
|
|
if( rc==LSM_BUSY ) rc = LSM_OK;
|
|
|
|
if( bRestore && pDb->pCsr ){
|
|
lsmMCursorFreeCache(pDb);
|
|
lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
|
|
pDb->pClient = 0;
|
|
if( rc==LSM_OK ){
|
|
rc = lsmCheckpointLoad(pDb, 0);
|
|
}
|
|
if( rc==LSM_OK ){
|
|
rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient);
|
|
}
|
|
if( rc==LSM_OK ){
|
|
rc = lsmRestoreCursors(pDb);
|
|
}
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** This function is only called during system shutdown. The contents of
|
|
** any in-memory trees present (old or current) are written out to disk.
|
|
*/
|
|
int lsmFlushTreeToDisk(lsm_db *pDb){
|
|
int rc;
|
|
|
|
rc = lsmBeginWork(pDb);
|
|
while( rc==LSM_OK && sortedDbIsFull(pDb) ){
|
|
rc = sortedWork(pDb, 256, pDb->nMerge, 1, 0);
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
rc = sortedNewToplevel(pDb, TREE_BOTH, 0);
|
|
}
|
|
|
|
lsmFinishWork(pDb, 1, &rc);
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Return a string representation of the segment passed as the only argument.
|
|
** Space for the returned string is allocated using lsmMalloc(), and should
|
|
** be freed by the caller using lsmFree().
|
|
*/
|
|
static char *segToString(lsm_env *pEnv, Segment *pSeg, int nMin){
|
|
LsmPgno nSize = pSeg->nSize;
|
|
LsmPgno iRoot = pSeg->iRoot;
|
|
LsmPgno iFirst = pSeg->iFirst;
|
|
LsmPgno iLast = pSeg->iLastPg;
|
|
char *z;
|
|
|
|
char *z1;
|
|
char *z2;
|
|
int nPad;
|
|
|
|
z1 = lsmMallocPrintf(pEnv, "%d.%d", iFirst, iLast);
|
|
if( iRoot ){
|
|
z2 = lsmMallocPrintf(pEnv, "root=%lld", iRoot);
|
|
}else{
|
|
z2 = lsmMallocPrintf(pEnv, "size=%lld", nSize);
|
|
}
|
|
|
|
nPad = nMin - 2 - strlen(z1) - 1 - strlen(z2);
|
|
nPad = LSM_MAX(0, nPad);
|
|
|
|
if( iRoot ){
|
|
z = lsmMallocPrintf(pEnv, "/%s %*s%s\\", z1, nPad, "", z2);
|
|
}else{
|
|
z = lsmMallocPrintf(pEnv, "|%s %*s%s|", z1, nPad, "", z2);
|
|
}
|
|
lsmFree(pEnv, z1);
|
|
lsmFree(pEnv, z2);
|
|
|
|
return z;
|
|
}
|
|
|
|
static int fileToString(
|
|
lsm_db *pDb, /* For xMalloc() */
|
|
char *aBuf,
|
|
int nBuf,
|
|
int nMin,
|
|
Segment *pSeg
|
|
){
|
|
int i = 0;
|
|
if( pSeg ){
|
|
char *zSeg;
|
|
|
|
zSeg = segToString(pDb->pEnv, pSeg, nMin);
|
|
snprintf(&aBuf[i], nBuf-i, "%s", zSeg);
|
|
i += strlen(&aBuf[i]);
|
|
lsmFree(pDb->pEnv, zSeg);
|
|
|
|
#ifdef LSM_LOG_FREELIST
|
|
lsmInfoArrayStructure(pDb, 1, pSeg->iFirst, &zSeg);
|
|
snprintf(&aBuf[i], nBuf-1, " (%s)", zSeg);
|
|
i += strlen(&aBuf[i]);
|
|
lsmFree(pDb->pEnv, zSeg);
|
|
#endif
|
|
aBuf[nBuf] = 0;
|
|
}else{
|
|
aBuf[0] = '\0';
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
void sortedDumpPage(lsm_db *pDb, Segment *pRun, Page *pPg, int bVals){
|
|
LsmBlob blob = {0, 0, 0}; /* LsmBlob used for keys */
|
|
LsmString s;
|
|
int i;
|
|
|
|
int nRec;
|
|
LsmPgno iPtr;
|
|
int flags;
|
|
u8 *aData;
|
|
int nData;
|
|
|
|
aData = fsPageData(pPg, &nData);
|
|
|
|
nRec = pageGetNRec(aData, nData);
|
|
iPtr = pageGetPtr(aData, nData);
|
|
flags = pageGetFlags(aData, nData);
|
|
|
|
lsmStringInit(&s, pDb->pEnv);
|
|
lsmStringAppendf(&s,"nCell=%d iPtr=%lld flags=%d {", nRec, iPtr, flags);
|
|
if( flags&SEGMENT_BTREE_FLAG ) iPtr = 0;
|
|
|
|
for(i=0; i<nRec; i++){
|
|
Page *pRef = 0; /* Pointer to page iRef */
|
|
int iChar;
|
|
u8 *aKey; int nKey = 0; /* Key */
|
|
u8 *aVal = 0; int nVal = 0; /* Value */
|
|
int iTopic;
|
|
u8 *aCell;
|
|
i64 iPgPtr;
|
|
int eType;
|
|
|
|
aCell = pageGetCell(aData, nData, i);
|
|
eType = *aCell++;
|
|
assert( (flags & SEGMENT_BTREE_FLAG) || eType!=0 );
|
|
aCell += lsmVarintGet64(aCell, &iPgPtr);
|
|
|
|
if( eType==0 ){
|
|
LsmPgno iRef; /* Page number of referenced page */
|
|
aCell += lsmVarintGet64(aCell, &iRef);
|
|
lsmFsDbPageGet(pDb->pFS, pRun, iRef, &pRef);
|
|
aKey = pageGetKey(pRun, pRef, 0, &iTopic, &nKey, &blob);
|
|
}else{
|
|
aCell += lsmVarintGet32(aCell, &nKey);
|
|
if( rtIsWrite(eType) ) aCell += lsmVarintGet32(aCell, &nVal);
|
|
sortedReadData(0, pPg, (aCell-aData), nKey+nVal, (void **)&aKey, &blob);
|
|
aVal = &aKey[nKey];
|
|
iTopic = eType;
|
|
}
|
|
|
|
lsmStringAppendf(&s, "%s%2X:", (i==0?"":" "), iTopic);
|
|
for(iChar=0; iChar<nKey; iChar++){
|
|
lsmStringAppendf(&s, "%c", isalnum(aKey[iChar]) ? aKey[iChar] : '.');
|
|
}
|
|
if( nVal>0 && bVals ){
|
|
lsmStringAppendf(&s, "##");
|
|
for(iChar=0; iChar<nVal; iChar++){
|
|
lsmStringAppendf(&s, "%c", isalnum(aVal[iChar]) ? aVal[iChar] : '.');
|
|
}
|
|
}
|
|
|
|
lsmStringAppendf(&s, " %lld", iPgPtr+iPtr);
|
|
lsmFsPageRelease(pRef);
|
|
}
|
|
lsmStringAppend(&s, "}", 1);
|
|
|
|
lsmLogMessage(pDb, LSM_OK, " Page %d: %s", lsmFsPageNumber(pPg), s.z);
|
|
lsmStringClear(&s);
|
|
|
|
sortedBlobFree(&blob);
|
|
}
|
|
|
|
static void infoCellDump(
|
|
lsm_db *pDb, /* Database handle */
|
|
Segment *pSeg, /* Segment page belongs to */
|
|
int bIndirect, /* True to follow indirect refs */
|
|
Page *pPg,
|
|
int iCell,
|
|
int *peType,
|
|
int *piPgPtr,
|
|
u8 **paKey, int *pnKey,
|
|
u8 **paVal, int *pnVal,
|
|
LsmBlob *pBlob
|
|
){
|
|
u8 *aData; int nData; /* Page data */
|
|
u8 *aKey; int nKey = 0; /* Key */
|
|
u8 *aVal = 0; int nVal = 0; /* Value */
|
|
int eType;
|
|
int iPgPtr;
|
|
Page *pRef = 0; /* Pointer to page iRef */
|
|
u8 *aCell;
|
|
|
|
aData = fsPageData(pPg, &nData);
|
|
|
|
aCell = pageGetCell(aData, nData, iCell);
|
|
eType = *aCell++;
|
|
aCell += lsmVarintGet32(aCell, &iPgPtr);
|
|
|
|
if( eType==0 ){
|
|
int dummy;
|
|
LsmPgno iRef; /* Page number of referenced page */
|
|
aCell += lsmVarintGet64(aCell, &iRef);
|
|
if( bIndirect ){
|
|
lsmFsDbPageGet(pDb->pFS, pSeg, iRef, &pRef);
|
|
pageGetKeyCopy(pDb->pEnv, pSeg, pRef, 0, &dummy, pBlob);
|
|
aKey = (u8 *)pBlob->pData;
|
|
nKey = pBlob->nData;
|
|
lsmFsPageRelease(pRef);
|
|
}else{
|
|
aKey = (u8 *)"<indirect>";
|
|
nKey = 11;
|
|
}
|
|
}else{
|
|
aCell += lsmVarintGet32(aCell, &nKey);
|
|
if( rtIsWrite(eType) ) aCell += lsmVarintGet32(aCell, &nVal);
|
|
sortedReadData(pSeg, pPg, (aCell-aData), nKey+nVal, (void **)&aKey, pBlob);
|
|
aVal = &aKey[nKey];
|
|
}
|
|
|
|
if( peType ) *peType = eType;
|
|
if( piPgPtr ) *piPgPtr = iPgPtr;
|
|
if( paKey ) *paKey = aKey;
|
|
if( paVal ) *paVal = aVal;
|
|
if( pnKey ) *pnKey = nKey;
|
|
if( pnVal ) *pnVal = nVal;
|
|
}
|
|
|
|
static int infoAppendBlob(LsmString *pStr, int bHex, u8 *z, int n){
|
|
int iChar;
|
|
for(iChar=0; iChar<n; iChar++){
|
|
if( bHex ){
|
|
lsmStringAppendf(pStr, "%02X", z[iChar]);
|
|
}else{
|
|
lsmStringAppendf(pStr, "%c", isalnum(z[iChar]) ?z[iChar] : '.');
|
|
}
|
|
}
|
|
return LSM_OK;
|
|
}
|
|
|
|
#define INFO_PAGE_DUMP_DATA 0x01
|
|
#define INFO_PAGE_DUMP_VALUES 0x02
|
|
#define INFO_PAGE_DUMP_HEX 0x04
|
|
#define INFO_PAGE_DUMP_INDIRECT 0x08
|
|
|
|
static int infoPageDump(
|
|
lsm_db *pDb, /* Database handle */
|
|
LsmPgno iPg, /* Page number of page to dump */
|
|
int flags,
|
|
char **pzOut /* OUT: lsmMalloc'd string */
|
|
){
|
|
int rc = LSM_OK; /* Return code */
|
|
Page *pPg = 0; /* Handle for page iPg */
|
|
int i, j; /* Loop counters */
|
|
const int perLine = 16; /* Bytes per line in the raw hex dump */
|
|
Segment *pSeg = 0;
|
|
Snapshot *pSnap;
|
|
|
|
int bValues = (flags & INFO_PAGE_DUMP_VALUES);
|
|
int bHex = (flags & INFO_PAGE_DUMP_HEX);
|
|
int bData = (flags & INFO_PAGE_DUMP_DATA);
|
|
int bIndirect = (flags & INFO_PAGE_DUMP_INDIRECT);
|
|
|
|
*pzOut = 0;
|
|
if( iPg==0 ) return LSM_ERROR;
|
|
|
|
assert( pDb->pClient || pDb->pWorker );
|
|
pSnap = pDb->pClient;
|
|
if( pSnap==0 ) pSnap = pDb->pWorker;
|
|
if( pSnap->redirect.n>0 ){
|
|
Level *pLvl;
|
|
int bUse = 0;
|
|
for(pLvl=pSnap->pLevel; pLvl->pNext; pLvl=pLvl->pNext);
|
|
pSeg = (pLvl->nRight==0 ? &pLvl->lhs : &pLvl->aRhs[pLvl->nRight-1]);
|
|
rc = lsmFsSegmentContainsPg(pDb->pFS, pSeg, iPg, &bUse);
|
|
if( bUse==0 ){
|
|
pSeg = 0;
|
|
}
|
|
}
|
|
|
|
/* iPg is a real page number (not subject to redirection). So it is safe
|
|
** to pass a NULL in place of the segment pointer as the second argument
|
|
** to lsmFsDbPageGet() here. */
|
|
if( rc==LSM_OK ){
|
|
rc = lsmFsDbPageGet(pDb->pFS, 0, iPg, &pPg);
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
LsmBlob blob = {0, 0, 0, 0};
|
|
int nKeyWidth = 0;
|
|
LsmString str;
|
|
int nRec;
|
|
LsmPgno iPtr;
|
|
int flags2;
|
|
int iCell;
|
|
u8 *aData; int nData; /* Page data and size thereof */
|
|
|
|
aData = fsPageData(pPg, &nData);
|
|
nRec = pageGetNRec(aData, nData);
|
|
iPtr = pageGetPtr(aData, nData);
|
|
flags2 = pageGetFlags(aData, nData);
|
|
|
|
lsmStringInit(&str, pDb->pEnv);
|
|
lsmStringAppendf(&str, "Page : %lld (%d bytes)\n", iPg, nData);
|
|
lsmStringAppendf(&str, "nRec : %d\n", nRec);
|
|
lsmStringAppendf(&str, "iPtr : %lld\n", iPtr);
|
|
lsmStringAppendf(&str, "flags: %04x\n", flags2);
|
|
lsmStringAppendf(&str, "\n");
|
|
|
|
for(iCell=0; iCell<nRec; iCell++){
|
|
int nKey;
|
|
infoCellDump(
|
|
pDb, pSeg, bIndirect, pPg, iCell, 0, 0, 0, &nKey, 0, 0, &blob
|
|
);
|
|
if( nKey>nKeyWidth ) nKeyWidth = nKey;
|
|
}
|
|
if( bHex ) nKeyWidth = nKeyWidth * 2;
|
|
|
|
for(iCell=0; iCell<nRec; iCell++){
|
|
u8 *aKey; int nKey = 0; /* Key */
|
|
u8 *aVal; int nVal = 0; /* Value */
|
|
int iPgPtr;
|
|
int eType;
|
|
LsmPgno iAbsPtr;
|
|
char zFlags[8];
|
|
|
|
infoCellDump(pDb, pSeg, bIndirect, pPg, iCell, &eType, &iPgPtr,
|
|
&aKey, &nKey, &aVal, &nVal, &blob
|
|
);
|
|
iAbsPtr = iPgPtr + ((flags2 & SEGMENT_BTREE_FLAG) ? 0 : iPtr);
|
|
|
|
lsmFlagsToString(eType, zFlags);
|
|
lsmStringAppendf(&str, "%s %d (%s) ",
|
|
zFlags, iAbsPtr, (rtTopic(eType) ? "sys" : "usr")
|
|
);
|
|
infoAppendBlob(&str, bHex, aKey, nKey);
|
|
if( nVal>0 && bValues ){
|
|
lsmStringAppendf(&str, "%*s", nKeyWidth - (nKey*(1+bHex)), "");
|
|
lsmStringAppendf(&str, " ");
|
|
infoAppendBlob(&str, bHex, aVal, nVal);
|
|
}
|
|
if( rtTopic(eType) ){
|
|
int iBlk = (int)~lsmGetU32(aKey);
|
|
lsmStringAppendf(&str, " (block=%d", iBlk);
|
|
if( nVal>0 ){
|
|
i64 iSnap = lsmGetU64(aVal);
|
|
lsmStringAppendf(&str, " snapshot=%lld", iSnap);
|
|
}
|
|
lsmStringAppendf(&str, ")");
|
|
}
|
|
lsmStringAppendf(&str, "\n");
|
|
}
|
|
|
|
if( bData ){
|
|
lsmStringAppendf(&str, "\n-------------------"
|
|
"-------------------------------------------------------------\n");
|
|
lsmStringAppendf(&str, "Page %d\n",
|
|
iPg, (iPg-1)*nData, iPg*nData - 1);
|
|
for(i=0; i<nData; i += perLine){
|
|
lsmStringAppendf(&str, "%04x: ", i);
|
|
for(j=0; j<perLine; j++){
|
|
if( i+j>nData ){
|
|
lsmStringAppendf(&str, " ");
|
|
}else{
|
|
lsmStringAppendf(&str, "%02x ", aData[i+j]);
|
|
}
|
|
}
|
|
lsmStringAppendf(&str, " ");
|
|
for(j=0; j<perLine; j++){
|
|
if( i+j>nData ){
|
|
lsmStringAppendf(&str, " ");
|
|
}else{
|
|
lsmStringAppendf(&str,"%c", isprint(aData[i+j]) ? aData[i+j] : '.');
|
|
}
|
|
}
|
|
lsmStringAppendf(&str,"\n");
|
|
}
|
|
}
|
|
|
|
*pzOut = str.z;
|
|
sortedBlobFree(&blob);
|
|
lsmFsPageRelease(pPg);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
int lsmInfoPageDump(
|
|
lsm_db *pDb, /* Database handle */
|
|
LsmPgno iPg, /* Page number of page to dump */
|
|
int bHex, /* True to output key/value in hex form */
|
|
char **pzOut /* OUT: lsmMalloc'd string */
|
|
){
|
|
int flags = INFO_PAGE_DUMP_DATA | INFO_PAGE_DUMP_VALUES;
|
|
if( bHex ) flags |= INFO_PAGE_DUMP_HEX;
|
|
return infoPageDump(pDb, iPg, flags, pzOut);
|
|
}
|
|
|
|
void sortedDumpSegment(lsm_db *pDb, Segment *pRun, int bVals){
|
|
assert( pDb->xLog );
|
|
if( pRun && pRun->iFirst ){
|
|
int flags = (bVals ? INFO_PAGE_DUMP_VALUES : 0);
|
|
char *zSeg;
|
|
Page *pPg;
|
|
|
|
zSeg = segToString(pDb->pEnv, pRun, 0);
|
|
lsmLogMessage(pDb, LSM_OK, "Segment: %s", zSeg);
|
|
lsmFree(pDb->pEnv, zSeg);
|
|
|
|
lsmFsDbPageGet(pDb->pFS, pRun, pRun->iFirst, &pPg);
|
|
while( pPg ){
|
|
Page *pNext;
|
|
char *z = 0;
|
|
infoPageDump(pDb, lsmFsPageNumber(pPg), flags, &z);
|
|
lsmLogMessage(pDb, LSM_OK, "%s", z);
|
|
lsmFree(pDb->pEnv, z);
|
|
#if 0
|
|
sortedDumpPage(pDb, pRun, pPg, bVals);
|
|
#endif
|
|
lsmFsDbPageNext(pRun, pPg, 1, &pNext);
|
|
lsmFsPageRelease(pPg);
|
|
pPg = pNext;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
** Invoke the log callback zero or more times with messages that describe
|
|
** the current database structure.
|
|
*/
|
|
void lsmSortedDumpStructure(
|
|
lsm_db *pDb, /* Database handle (used for xLog callback) */
|
|
Snapshot *pSnap, /* Snapshot to dump */
|
|
int bKeys, /* Output the keys from each segment */
|
|
int bVals, /* Output the values from each segment */
|
|
const char *zWhy /* Caption to print near top of dump */
|
|
){
|
|
Snapshot *pDump = pSnap;
|
|
Level *pTopLevel;
|
|
char *zFree = 0;
|
|
|
|
assert( pSnap );
|
|
pTopLevel = lsmDbSnapshotLevel(pDump);
|
|
if( pDb->xLog && pTopLevel ){
|
|
static int nCall = 0;
|
|
Level *pLevel;
|
|
int iLevel = 0;
|
|
|
|
nCall++;
|
|
lsmLogMessage(pDb, LSM_OK, "Database structure %d (%s)", nCall, zWhy);
|
|
|
|
#if 0
|
|
if( nCall==1031 || nCall==1032 ) bKeys=1;
|
|
#endif
|
|
|
|
for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){
|
|
char zLeft[1024];
|
|
char zRight[1024];
|
|
int i = 0;
|
|
|
|
Segment *aLeft[24];
|
|
Segment *aRight[24];
|
|
|
|
int nLeft = 0;
|
|
int nRight = 0;
|
|
|
|
Segment *pSeg = &pLevel->lhs;
|
|
aLeft[nLeft++] = pSeg;
|
|
|
|
for(i=0; i<pLevel->nRight; i++){
|
|
aRight[nRight++] = &pLevel->aRhs[i];
|
|
}
|
|
|
|
#ifdef LSM_LOG_FREELIST
|
|
if( nRight ){
|
|
memmove(&aRight[1], aRight, sizeof(aRight[0])*nRight);
|
|
aRight[0] = 0;
|
|
nRight++;
|
|
}
|
|
#endif
|
|
|
|
for(i=0; i<nLeft || i<nRight; i++){
|
|
int iPad = 0;
|
|
char zLevel[32];
|
|
zLeft[0] = '\0';
|
|
zRight[0] = '\0';
|
|
|
|
if( i<nLeft ){
|
|
fileToString(pDb, zLeft, sizeof(zLeft), 24, aLeft[i]);
|
|
}
|
|
if( i<nRight ){
|
|
fileToString(pDb, zRight, sizeof(zRight), 24, aRight[i]);
|
|
}
|
|
|
|
if( i==0 ){
|
|
snprintf(zLevel, sizeof(zLevel), "L%d: (age=%d) (flags=%.4x)",
|
|
iLevel, (int)pLevel->iAge, (int)pLevel->flags
|
|
);
|
|
}else{
|
|
zLevel[0] = '\0';
|
|
}
|
|
|
|
if( nRight==0 ){
|
|
iPad = 10;
|
|
}
|
|
|
|
lsmLogMessage(pDb, LSM_OK, "% 25s % *s% -35s %s",
|
|
zLevel, iPad, "", zLeft, zRight
|
|
);
|
|
}
|
|
|
|
iLevel++;
|
|
}
|
|
|
|
if( bKeys ){
|
|
for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){
|
|
int i;
|
|
sortedDumpSegment(pDb, &pLevel->lhs, bVals);
|
|
for(i=0; i<pLevel->nRight; i++){
|
|
sortedDumpSegment(pDb, &pLevel->aRhs[i], bVals);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
lsmInfoFreelist(pDb, &zFree);
|
|
lsmLogMessage(pDb, LSM_OK, "Freelist: %s", zFree);
|
|
lsmFree(pDb->pEnv, zFree);
|
|
|
|
assert( lsmFsIntegrityCheck(pDb) );
|
|
}
|
|
|
|
void lsmSortedFreeLevel(lsm_env *pEnv, Level *pLevel){
|
|
Level *pNext;
|
|
Level *p;
|
|
|
|
for(p=pLevel; p; p=pNext){
|
|
pNext = p->pNext;
|
|
sortedFreeLevel(pEnv, p);
|
|
}
|
|
}
|
|
|
|
void lsmSortedSaveTreeCursors(lsm_db *pDb){
|
|
MultiCursor *pCsr;
|
|
for(pCsr=pDb->pCsr; pCsr; pCsr=pCsr->pNext){
|
|
lsmTreeCursorSave(pCsr->apTreeCsr[0]);
|
|
lsmTreeCursorSave(pCsr->apTreeCsr[1]);
|
|
}
|
|
}
|
|
|
|
void lsmSortedExpandBtreePage(Page *pPg, int nOrig){
|
|
u8 *aData;
|
|
int nData;
|
|
int nEntry;
|
|
int iHdr;
|
|
|
|
aData = lsmFsPageData(pPg, &nData);
|
|
nEntry = pageGetNRec(aData, nOrig);
|
|
iHdr = SEGMENT_EOF(nOrig, nEntry);
|
|
memmove(&aData[iHdr + (nData-nOrig)], &aData[iHdr], nOrig-iHdr);
|
|
}
|
|
|
|
#ifdef LSM_DEBUG_EXPENSIVE
|
|
static void assertRunInOrder(lsm_db *pDb, Segment *pSeg){
|
|
Page *pPg = 0;
|
|
LsmBlob blob1 = {0, 0, 0, 0};
|
|
LsmBlob blob2 = {0, 0, 0, 0};
|
|
|
|
lsmFsDbPageGet(pDb->pFS, pSeg, pSeg->iFirst, &pPg);
|
|
while( pPg ){
|
|
u8 *aData; int nData;
|
|
Page *pNext;
|
|
|
|
aData = lsmFsPageData(pPg, &nData);
|
|
if( 0==(pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG) ){
|
|
int i;
|
|
int nRec = pageGetNRec(aData, nData);
|
|
for(i=0; i<nRec; i++){
|
|
int iTopic1, iTopic2;
|
|
pageGetKeyCopy(pDb->pEnv, pSeg, pPg, i, &iTopic1, &blob1);
|
|
|
|
if( i==0 && blob2.nData ){
|
|
assert( sortedKeyCompare(
|
|
pDb->xCmp, iTopic2, blob2.pData, blob2.nData,
|
|
iTopic1, blob1.pData, blob1.nData
|
|
)<0 );
|
|
}
|
|
|
|
if( i<(nRec-1) ){
|
|
pageGetKeyCopy(pDb->pEnv, pSeg, pPg, i+1, &iTopic2, &blob2);
|
|
assert( sortedKeyCompare(
|
|
pDb->xCmp, iTopic1, blob1.pData, blob1.nData,
|
|
iTopic2, blob2.pData, blob2.nData
|
|
)<0 );
|
|
}
|
|
}
|
|
}
|
|
|
|
lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
|
|
lsmFsPageRelease(pPg);
|
|
pPg = pNext;
|
|
}
|
|
|
|
sortedBlobFree(&blob1);
|
|
sortedBlobFree(&blob2);
|
|
}
|
|
#endif
|
|
|
|
#ifdef LSM_DEBUG_EXPENSIVE
|
|
/*
|
|
** This function is only included in the build if LSM_DEBUG_EXPENSIVE is
|
|
** defined. Its only purpose is to evaluate various assert() statements to
|
|
** verify that the database is well formed in certain respects.
|
|
**
|
|
** More specifically, it checks that the array pOne contains the required
|
|
** pointers to pTwo. Array pTwo must be a main array. pOne may be either a
|
|
** separators array or another main array. If pOne does not contain the
|
|
** correct set of pointers, an assert() statement fails.
|
|
*/
|
|
static int assertPointersOk(
|
|
lsm_db *pDb, /* Database handle */
|
|
Segment *pOne, /* Segment containing pointers */
|
|
Segment *pTwo, /* Segment containing pointer targets */
|
|
int bRhs /* True if pTwo may have been Gobble()d */
|
|
){
|
|
int rc = LSM_OK; /* Error code */
|
|
SegmentPtr ptr1; /* Iterates through pOne */
|
|
SegmentPtr ptr2; /* Iterates through pTwo */
|
|
LsmPgno iPrev;
|
|
|
|
assert( pOne && pTwo );
|
|
|
|
memset(&ptr1, 0, sizeof(ptr1));
|
|
memset(&ptr2, 0, sizeof(ptr1));
|
|
ptr1.pSeg = pOne;
|
|
ptr2.pSeg = pTwo;
|
|
segmentPtrEndPage(pDb->pFS, &ptr1, 0, &rc);
|
|
segmentPtrEndPage(pDb->pFS, &ptr2, 0, &rc);
|
|
|
|
/* Check that the footer pointer of the first page of pOne points to
|
|
** the first page of pTwo. */
|
|
iPrev = pTwo->iFirst;
|
|
if( ptr1.iPtr!=iPrev && !bRhs ){
|
|
assert( 0 );
|
|
}
|
|
|
|
if( rc==LSM_OK && ptr1.nCell>0 ){
|
|
rc = segmentPtrLoadCell(&ptr1, 0);
|
|
}
|
|
|
|
while( rc==LSM_OK && ptr2.pPg ){
|
|
LsmPgno iThis;
|
|
|
|
/* Advance to the next page of segment pTwo that contains at least
|
|
** one cell. Break out of the loop if the iterator reaches EOF. */
|
|
do{
|
|
rc = segmentPtrNextPage(&ptr2, 1);
|
|
assert( rc==LSM_OK );
|
|
}while( rc==LSM_OK && ptr2.pPg && ptr2.nCell==0 );
|
|
if( rc!=LSM_OK || ptr2.pPg==0 ) break;
|
|
iThis = lsmFsPageNumber(ptr2.pPg);
|
|
|
|
if( (ptr2.flags & (PGFTR_SKIP_THIS_FLAG|SEGMENT_BTREE_FLAG))==0 ){
|
|
|
|
/* Load the first cell in the array pTwo page. */
|
|
rc = segmentPtrLoadCell(&ptr2, 0);
|
|
|
|
/* Iterate forwards through pOne, searching for a key that matches the
|
|
** key ptr2.pKey/nKey. This key should have a pointer to the page that
|
|
** ptr2 currently points to. */
|
|
while( rc==LSM_OK ){
|
|
int res = rtTopic(ptr1.eType) - rtTopic(ptr2.eType);
|
|
if( res==0 ){
|
|
res = pDb->xCmp(ptr1.pKey, ptr1.nKey, ptr2.pKey, ptr2.nKey);
|
|
}
|
|
|
|
if( res<0 ){
|
|
assert( bRhs || ptr1.iPtr+ptr1.iPgPtr==iPrev );
|
|
}else if( res>0 ){
|
|
assert( 0 );
|
|
}else{
|
|
assert( ptr1.iPtr+ptr1.iPgPtr==iThis );
|
|
iPrev = iThis;
|
|
break;
|
|
}
|
|
|
|
rc = segmentPtrAdvance(0, &ptr1, 0);
|
|
if( ptr1.pPg==0 ){
|
|
assert( 0 );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
segmentPtrReset(&ptr1, 0);
|
|
segmentPtrReset(&ptr2, 0);
|
|
return LSM_OK;
|
|
}
|
|
|
|
/*
|
|
** This function is only included in the build if LSM_DEBUG_EXPENSIVE is
|
|
** defined. Its only purpose is to evaluate various assert() statements to
|
|
** verify that the database is well formed in certain respects.
|
|
**
|
|
** More specifically, it checks that the b-tree embedded in array pRun
|
|
** contains the correct keys. If not, an assert() fails.
|
|
*/
|
|
static int assertBtreeOk(
|
|
lsm_db *pDb,
|
|
Segment *pSeg
|
|
){
|
|
int rc = LSM_OK; /* Return code */
|
|
if( pSeg->iRoot ){
|
|
LsmBlob blob = {0, 0, 0}; /* Buffer used to cache overflow keys */
|
|
FileSystem *pFS = pDb->pFS; /* File system to read from */
|
|
Page *pPg = 0; /* Main run page */
|
|
BtreeCursor *pCsr = 0; /* Btree cursor */
|
|
|
|
rc = btreeCursorNew(pDb, pSeg, &pCsr);
|
|
if( rc==LSM_OK ){
|
|
rc = btreeCursorFirst(pCsr);
|
|
}
|
|
if( rc==LSM_OK ){
|
|
rc = lsmFsDbPageGet(pFS, pSeg, pSeg->iFirst, &pPg);
|
|
}
|
|
|
|
while( rc==LSM_OK ){
|
|
Page *pNext;
|
|
u8 *aData;
|
|
int nData;
|
|
int flags;
|
|
|
|
rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
|
|
lsmFsPageRelease(pPg);
|
|
pPg = pNext;
|
|
if( pPg==0 ) break;
|
|
aData = fsPageData(pPg, &nData);
|
|
flags = pageGetFlags(aData, nData);
|
|
if( rc==LSM_OK
|
|
&& 0==((SEGMENT_BTREE_FLAG|PGFTR_SKIP_THIS_FLAG) & flags)
|
|
&& 0!=pageGetNRec(aData, nData)
|
|
){
|
|
u8 *pKey;
|
|
int nKey;
|
|
int iTopic;
|
|
pKey = pageGetKey(pSeg, pPg, 0, &iTopic, &nKey, &blob);
|
|
assert( nKey==pCsr->nKey && 0==memcmp(pKey, pCsr->pKey, nKey) );
|
|
assert( lsmFsPageNumber(pPg)==pCsr->iPtr );
|
|
rc = btreeCursorNext(pCsr);
|
|
}
|
|
}
|
|
assert( rc!=LSM_OK || pCsr->pKey==0 );
|
|
|
|
if( pPg ) lsmFsPageRelease(pPg);
|
|
|
|
btreeCursorFree(pCsr);
|
|
sortedBlobFree(&blob);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
#endif /* ifdef LSM_DEBUG_EXPENSIVE */
|