0
0
mirror of https://github.com/tursodatabase/libsql.git synced 2024-12-16 10:18:47 +00:00
libsql/libsql-sqlite3/src/vectorIndexInt.h
2024-07-26 22:51:36 +04:00

250 lines
11 KiB
C

#ifndef _VECTOR_INDEX_H
#define _VECTOR_INDEX_H
#include "sqlite3.h"
#include "vectorInt.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct DiskAnnIndex DiskAnnIndex;
typedef struct BlobSpot BlobSpot;
/*
* Main type which holds all necessary index information and will be passed as a first argument in all index-related operations
*/
struct DiskAnnIndex {
sqlite3 *db; /* Database connection */
char *zDbSName; /* Database name */
char *zName; /* Index name */
char *zShadow; /* Shadow table name */
int nFormatVersion; /* DiskAnn format version */
int nDistanceFunc; /* Distance function */
int nBlockSize; /* Size of the block which stores all data for single node */
int nVectorDims; /* Vector dimensions */
int nNodeVectorType; /* Vector type of each node */
int nEdgeVectorType; /* Vector type of each edge */
int nNodeVectorSize; /* Vector size of each node in bytes */
int nEdgeVectorSize; /* Vector size of each edge in bytes */
float pruningAlpha; /* Alpha parameter for edge pruning during INSERT operation */
int insertL; /* Max size of candidate set (L) visited during INSERT operation */
int searchL; /* Max size of candidate set (L) visited during SEARCH operation (can be overriden from query in future) */
};
/*
* Simple utility class which holds sqlite3_blob handle poiting to the nRowid (undefined if pBlob == NULL)
* Caller can re-load BlobSpot with blobSpotReload(...) method which will reopen blob at new row position
* sqlite3_blob_reopen API can be visibly faster than close/open pair since a lot of check can be omitted
*/
struct BlobSpot {
u64 nRowid; /* last rowid for which open/reopen was called; undefined if BlobSpot was never opened */
sqlite3_blob *pBlob; /* BLOB handle */
u8 *pBuffer; /* buffer for BLOB data */
int nBufferSize; /* buffer size */
u8 isWritable; /* blob open mode (readonly or read/write) */
u8 isInitialized; /* was blob read after creation or not */
u8 isAborted; /* set to true if last operation with blob failed with non-zero code */
};
/* Special error code for blobSpotCreate/blobSpotReload functions which will fire where rowid doesn't exists in the table */
#define DISKANN_ROW_NOT_FOUND 1001
#define DISKANN_BLOB_WRITABLE 1
#define DISKANN_BLOB_READONLY 0
/* BlobSpot operations */
int blobSpotCreate(const DiskAnnIndex *pIndex, BlobSpot **ppBlobSpot, u64 nRowid, int nBufferSize, int isWritable);
int blobSpotReload(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, u64 nRowid, int nBufferSize);
int blobSpotFlush(BlobSpot *pBlobSpot);
void blobSpotFree(BlobSpot *pBlobSpot);
/*
* Accessor for node binary format
* - v1 format is the following:
* [u64 nRowid] [u16 nEdges] [node vector] [edge vector] * nEdges [trash vector] * (nMaxEdges - nEdges) ([u64 legacyField] [u64 edgeId]) * nEdges
* Note, that node vector and edge vector can have different representations (and edge vector can be smaller in size than node vector)
*/
int nodeEdgesMaxCount(const DiskAnnIndex *pIndex);
int nodeEdgesMetadataOffset(const DiskAnnIndex *pIndex);
void nodeBinInit(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, u64 nRowid, Vector *pVector);
void nodeBinVector(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, Vector *pVector);
u16 nodeBinEdges(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot);
void nodeBinEdge(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, Vector *pVector);
int nodeBinEdgeFindIdx(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, u64 nRowid);
void nodeBinPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int nPruned);
void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, Vector *pVector);
void nodeBinDeleteEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iDelete);
void nodeBinDebug(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot);
/**************************************************************************
** Vector index utilities
****************************************************************************/
/* Vector index utility objects */
typedef struct VectorIdxKey VectorIdxKey;
typedef struct VectorIdxParams VectorIdxParams;
typedef struct VectorInRow VectorInRow;
typedef struct VectorOutRows VectorOutRows;
typedef u8 IndexType;
typedef u8 MetricType;
/*
* All vector index parameters must be known to the vectorIndex module although it's interpretation are up to the specific implementation of the index
* (so, there is no validation of parameter values in the vectorIndex module - all this work must be delegated to the specific implementation)
* All enum-like type constants starts with 1 to make 0 an "unset" value placeholder
*/
/* format version which can help to upgrade vector on-disk format without breaking older version of the db */
#define VECTOR_FORMAT_PARAM_ID 1
/*
* 1 - initial version
*/
#define VECTOR_FORMAT_DEFAULT 1
/* type of the vector index */
#define VECTOR_INDEX_TYPE_PARAM_ID 2
#define VECTOR_INDEX_TYPE_DISKANN 1
/* type of the underlying vector for the vector index */
#define VECTOR_TYPE_PARAM_ID 3
/* dimension of the underlying vector for the vector index */
#define VECTOR_DIM_PARAM_ID 4
/* metric type used for comparing two vectors */
#define VECTOR_METRIC_TYPE_PARAM_ID 5
#define VECTOR_METRIC_TYPE_COS 1
#define VECTOR_METRIC_TYPE_L2 2
/* block size */
#define VECTOR_BLOCK_SIZE_PARAM_ID 6
#define VECTOR_BLOCK_SIZE_DEFAULT 128
#define VECTOR_PRUNING_ALPHA_PARAM_ID 7
#define VECTOR_PRUNING_ALPHA_DEFAULT 1.2
#define VECTOR_INSERT_L_PARAM_ID 8
#define VECTOR_INSERT_L_DEFAULT 70
#define VECTOR_SEARCH_L_PARAM_ID 9
#define VECTOR_SEARCH_L_DEFAULT 200
#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10
/* total amount of vector index parameters */
#define VECTOR_PARAM_IDS_COUNT 9
/*
* Vector index parameters are stored in simple binary format (1 byte tag + 8 byte u64 integer / f64 float)
* This will allow us to add parameters in future version more easily as we have full control over the format (compared to the "rigid" SQL schema)
* For now, VectorIdxParams allocated on stack and have 128 bytes hard limit (so far we have 9 parameters and 72 are enough for us)
*/
#define VECTOR_INDEX_PARAMS_BUF_SIZE 128
struct VectorIdxParams {
u8 pBinBuf[VECTOR_INDEX_PARAMS_BUF_SIZE];
int nBinSize;
};
/*
* Structure which holds information about primary key of the base table for vector index
* For tables with ROWID only this structure will have information about single column with INTEGER affinity and BINARY collation
* For now, VectorIdxKey allocated on stack have 16 columns hard limit (for now we are not supporting composite primary keys due to the limitation of virtual tables)
*/
#define VECTOR_INDEX_MAX_KEY_COLUMNS 16
struct VectorIdxKey {
int nKeyColumns;
char aKeyAffinity[VECTOR_INDEX_MAX_KEY_COLUMNS];
/* collation is owned by the caller and structure is not responsible for reclamation of collation string resources */
const char *azKeyCollation[VECTOR_INDEX_MAX_KEY_COLUMNS];
};
/*
* Structure which holds information about input payload for vector index (for INSERT/DELETE operations)
* pVector must be NULL for DELETE operation
*
* Resources must be reclaimed with vectorInRowFree(...) method
*/
struct VectorInRow {
Vector *pVector;
int nKeys;
sqlite3_value *pKeyValues;
};
/*
* Structure which holds information about result set of SEARCH operation
* It have special optimization for cases when single INTEGER primary key is used - in this case aIntValues array stores all values instead of ppValues
* In other case generic ppValues stores all column information
*
* Resources must be reclaimed with vectorOutRowsFree(...) method
*/
#define VECTOR_OUT_ROWS_MAX_CELLS (1<<30)
struct VectorOutRows {
int nRows;
int nCols;
i64 *aIntValues;
sqlite3_value **ppValues;
};
// limit to the sql part which we render in order to perform operations with shadow tables
// we render this parts of SQL on stack - thats why we have hard limit on this
// stack simplify memory managment code and also doesn't impose very strict limits here since 128 bytes for column names should be enough for almost all use cases
#define VECTOR_INDEX_SQL_RENDER_LIMIT 128
void vectorIdxParamsInit(VectorIdxParams *, u8 *, int);
u64 vectorIdxParamsGetU64(const VectorIdxParams *, char);
double vectorIdxParamsGetF64(const VectorIdxParams *, char);
int vectorIdxParamsPutU64(VectorIdxParams *, char, u64);
int vectorIdxParamsPutF64(VectorIdxParams *, char, double);
int vectorIdxKeyGet(const Index *, VectorIdxKey *, const char **);
int vectorIdxKeyRowidLike(const VectorIdxKey *);
int vectorIdxKeyDefsRender(const VectorIdxKey *, const char *, char *, int);
int vectorIdxKeyNamesRender(int, const char *, char *, int);
int vectorInRowAlloc(sqlite3 *, const UnpackedRecord *, VectorInRow *, char **);
sqlite3_value* vectorInRowKey(const VectorInRow *, int);
int vectorInRowTryGetRowid(const VectorInRow *, u64 *);
i64 vectorInRowLegacyId(const VectorInRow *);
int vectorInRowPlaceholderRender(const VectorInRow *, char *, int);
void vectorInRowFree(sqlite3 *, VectorInRow *);
int vectorOutRowsAlloc(sqlite3 *, VectorOutRows *, int, int, int);
int vectorOutRowsPut(VectorOutRows *, int, int, const u64 *, sqlite3_value *);
void vectorOutRowsGet(sqlite3_context *, const VectorOutRows *, int, int);
void vectorOutRowsFree(sqlite3 *, VectorOutRows *);
int diskAnnCreateIndex(sqlite3 *, const char *, const char *, const VectorIdxKey *, VectorIdxParams *);
int diskAnnClearIndex(sqlite3 *, const char *, const char *);
int diskAnnDropIndex(sqlite3 *, const char *, const char *);
int diskAnnOpenIndex(sqlite3 *, const char *, const char *, const VectorIdxParams *, DiskAnnIndex **);
void diskAnnCloseIndex(DiskAnnIndex *);
int diskAnnInsert(const DiskAnnIndex *, const VectorInRow *, char **);
int diskAnnDelete(const DiskAnnIndex *, const VectorInRow *, char **);
int diskAnnSearch(const DiskAnnIndex *, const Vector *, int, const VectorIdxKey *, VectorOutRows *, char **);
typedef struct VectorIdxCursor VectorIdxCursor;
#define VECTOR_INDEX_VTAB_NAME "vector_top_k"
#define VECTOR_INDEX_GLOBAL_META_TABLE "libsql_vector_meta_shadow"
#define VECTOR_INDEX_MARKER_FUNCTION "libsql_vector_idx"
int vectorIdxParseColumnType(const char *, int *, int *, const char **);
int vectorIndexCreate(Parse*, const Index*, const char *, const IdList*);
int vectorIndexClear(sqlite3 *, const char *, const char *);
int vectorIndexDrop(sqlite3 *, const char *, const char *);
int vectorIndexSearch(sqlite3 *, const char *, int, sqlite3_value **, VectorOutRows *, char **);
int vectorIndexCursorInit(sqlite3 *, const char *, const char *, VectorIdxCursor **);
void vectorIndexCursorClose(sqlite3 *, VectorIdxCursor *);
int vectorIndexInsert(VectorIdxCursor *, const UnpackedRecord *, char **);
int vectorIndexDelete(VectorIdxCursor *, const UnpackedRecord *, char **);
#ifdef __cplusplus
} /* end of the 'extern "C"' block */
#endif
#endif /* _VECTOR_INDEX_H */