libsql/libsql-sqlite3/src/vectorIndexInt.h

#ifndef _VECTOR_INDEX_H
#define _VECTOR_INDEX_H

#include "sqlite3.h"
#include "vectorInt.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct DiskAnnIndex DiskAnnIndex;
typedef struct BlobSpot BlobSpot;

/*
 * Main type which holds all necessary index information and will be passed as a first argument in all index-related operations
*/
struct DiskAnnIndex {
  sqlite3 *db;         /* Database connection */
  char *zDbSName;      /* Database name */
  char *zName;         /* Index name */
  char *zShadow;       /* Shadow table name */
  int nFormatVersion;  /* DiskAnn format version */
  int nDistanceFunc;   /* Distance function */
  int nBlockSize;      /* Size of the block which stores all data for single node */
  int nVectorDims;     /* Vector dimensions */
  int nNodeVectorType; /* Vector type of each node */
  int nEdgeVectorType; /* Vector type of each edge */
  int nNodeVectorSize; /* Vector size of each node in bytes */
  int nEdgeVectorSize; /* Vector size of each edge in bytes */
  float pruningAlpha;  /* Alpha parameter for edge pruning during INSERT operation */
  int insertL;         /* Max size of candidate set (L) visited during INSERT operation */
  int searchL;         /* Max size of candidate set (L) visited during SEARCH operation (can be overriden from query in future) */
};

/*
 * Simple utility class which holds sqlite3_blob handle poiting to the nRowid (undefined if pBlob == NULL)
 * Caller can re-load BlobSpot with blobSpotReload(...) method which will reopen blob at new row position
 * sqlite3_blob_reopen API can be visibly faster than close/open pair since a lot of check can be omitted
*/
struct BlobSpot {
  u64 nRowid;           /* last rowid for which open/reopen was called; undefined if BlobSpot was never opened */
  sqlite3_blob *pBlob;  /* BLOB handle */
  u8 *pBuffer;          /* buffer for BLOB data */
  int nBufferSize;      /* buffer size */
  u8 isWritable;        /* blob open mode (readonly or read/write) */
  u8 isInitialized;     /* was blob read after creation or not */
  u8 isAborted;         /* set to true if last operation with blob failed with non-zero code */
};

/* Special error code for blobSpotCreate/blobSpotReload functions which will fire where rowid doesn't exists in the table */
#define DISKANN_ROW_NOT_FOUND 1001

#define DISKANN_BLOB_WRITABLE 1
#define DISKANN_BLOB_READONLY 0

/* BlobSpot operations */
int blobSpotCreate(const DiskAnnIndex *pIndex, BlobSpot **ppBlobSpot, u64 nRowid, int nBufferSize, int isWritable);
int blobSpotReload(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, u64 nRowid, int nBufferSize);
int blobSpotFlush(BlobSpot *pBlobSpot);
void blobSpotFree(BlobSpot *pBlobSpot);

/*
 * Accessor for node binary format
 * - v1 format is the following:
 *   [u64 nRowid] [u16 nEdges] [node vector] [edge vector] * nEdges [trash vector] * (nMaxEdges - nEdges) ([u64 legacyField] [u64 edgeId]) * nEdges
 *   Note, that node vector and edge vector can have different representations (and edge vector can be smaller in size than node vector)
*/
int nodeEdgesMaxCount(const DiskAnnIndex *pIndex);
int nodeEdgesMetadataOffset(const DiskAnnIndex *pIndex);
void nodeBinInit(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, u64 nRowid, Vector *pVector);
void nodeBinVector(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, Vector *pVector);
u16 nodeBinEdges(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot);
void nodeBinEdge(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, Vector *pVector);
int nodeBinEdgeFindIdx(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, u64 nRowid);
void nodeBinPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int nPruned);
void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, Vector *pVector);
void nodeBinDeleteEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iDelete);
void nodeBinDebug(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot);

/**************************************************************************
** Vector index utilities
****************************************************************************/

/* Vector index utility objects */
typedef struct VectorIdxKey VectorIdxKey;
typedef struct VectorIdxParams VectorIdxParams;
typedef struct VectorInRow VectorInRow;
typedef struct VectorOutRows VectorOutRows;

typedef u8 IndexType;
typedef u8 MetricType;

/*
 * All vector index parameters must be known to the vectorIndex module although it's interpretation are up to the specific implementation of the index
 * (so, there is no validation of parameter values in the vectorIndex module - all this work must be delegated to the specific implementation)
 * All enum-like type constants starts with 1 to make 0 an "unset" value placeholder
*/

/* format version which can help to upgrade vector on-disk format without breaking older version of the db */
#define VECTOR_FORMAT_PARAM_ID         1
/*
 * 1 - initial version
*/
#define VECTOR_FORMAT_DEFAULT          1

/* type of the vector index */
#define VECTOR_INDEX_TYPE_PARAM_ID     2
#define VECTOR_INDEX_TYPE_DISKANN      1

/* type of the underlying vector for the vector index */
#define VECTOR_TYPE_PARAM_ID           3
/* dimension of the underlying vector for the vector index */
#define VECTOR_DIM_PARAM_ID            4

/* metric type used for comparing two vectors */
#define VECTOR_METRIC_TYPE_PARAM_ID    5
#define VECTOR_METRIC_TYPE_COS         1
#define VECTOR_METRIC_TYPE_L2          2

/* block size */
#define VECTOR_BLOCK_SIZE_PARAM_ID     6
#define VECTOR_BLOCK_SIZE_DEFAULT      128

#define VECTOR_PRUNING_ALPHA_PARAM_ID  7
#define VECTOR_PRUNING_ALPHA_DEFAULT   1.2

#define VECTOR_INSERT_L_PARAM_ID       8
#define VECTOR_INSERT_L_DEFAULT        70

#define VECTOR_SEARCH_L_PARAM_ID       9
#define VECTOR_SEARCH_L_DEFAULT        200

#define VECTOR_MAX_NEIGHBORS_PARAM_ID  10

/* total amount of vector index parameters */
#define VECTOR_PARAM_IDS_COUNT         9

/*
 * Vector index parameters are stored in simple binary format (1 byte tag + 8 byte u64 integer / f64 float)
 * This will allow us to add parameters in future version more easily as we have full control over the format (compared to the "rigid" SQL schema)
 * For now, VectorIdxParams allocated on stack and have 128 bytes hard limit (so far we have 9 parameters and 72 are enough for us)
*/
#define VECTOR_INDEX_PARAMS_BUF_SIZE 128
struct VectorIdxParams {
  u8 pBinBuf[VECTOR_INDEX_PARAMS_BUF_SIZE];
  int nBinSize;
};


/*
 * Structure which holds information about primary key of the base table for vector index
 * For tables with ROWID only this structure will have information about single column with INTEGER affinity and BINARY collation
 * For now, VectorIdxKey allocated on stack have 16 columns hard limit (for now we are not supporting composite primary keys due to the limitation of virtual tables)
*/
#define VECTOR_INDEX_MAX_KEY_COLUMNS 16
struct VectorIdxKey {
  int nKeyColumns;
  char aKeyAffinity[VECTOR_INDEX_MAX_KEY_COLUMNS];
  /* collation is owned by the caller and structure is not responsible for reclamation of collation string resources */
  const char *azKeyCollation[VECTOR_INDEX_MAX_KEY_COLUMNS];
};

/*
 * Structure which holds information about input payload for vector index (for INSERT/DELETE operations)
 * pVector must be NULL for DELETE operation
 *
 * Resources must be reclaimed with vectorInRowFree(...) method
*/
struct VectorInRow {
  Vector *pVector;
  int nKeys;
  sqlite3_value *pKeyValues;
};

/*
 * Structure which holds information about result set of SEARCH operation
 * It have special optimization for cases when single INTEGER primary key is used - in this case aIntValues array stores all values instead of ppValues
 * In other case generic ppValues stores all column information
 *
 * Resources must be reclaimed with vectorOutRowsFree(...) method
*/
#define VECTOR_OUT_ROWS_MAX_CELLS (1<<30)
struct VectorOutRows {
  int nRows;
  int nCols;
  i64 *aIntValues;
  sqlite3_value **ppValues;
};

// limit to the sql part which we render in order to perform operations with shadow tables
// we render this parts of SQL on stack - thats why we have hard limit on this
// stack simplify memory managment code and also doesn't impose very strict limits here since 128 bytes for column names should be enough for almost all use cases
#define VECTOR_INDEX_SQL_RENDER_LIMIT 128

void vectorIdxParamsInit(VectorIdxParams *, u8 *, int);
u64 vectorIdxParamsGetU64(const VectorIdxParams *, char);
double vectorIdxParamsGetF64(const VectorIdxParams *, char);
int vectorIdxParamsPutU64(VectorIdxParams *, char, u64);
int vectorIdxParamsPutF64(VectorIdxParams *, char, double);

int vectorIdxKeyGet(const Index *, VectorIdxKey *, const char **);
int vectorIdxKeyRowidLike(const VectorIdxKey *);
int vectorIdxKeyDefsRender(const VectorIdxKey *, const char *, char *, int);
int vectorIdxKeyNamesRender(int, const char *, char *, int);

int vectorInRowAlloc(sqlite3 *, const UnpackedRecord *, VectorInRow *, char **);
sqlite3_value* vectorInRowKey(const VectorInRow *, int);
int vectorInRowTryGetRowid(const VectorInRow *, u64 *);
i64 vectorInRowLegacyId(const VectorInRow *);
int vectorInRowPlaceholderRender(const VectorInRow *, char *, int);
void vectorInRowFree(sqlite3 *, VectorInRow *);

int vectorOutRowsAlloc(sqlite3 *, VectorOutRows *, int, int, int);
int vectorOutRowsPut(VectorOutRows *, int, int, const u64 *, sqlite3_value *);
void vectorOutRowsGet(sqlite3_context *, const VectorOutRows *, int, int);
void vectorOutRowsFree(sqlite3 *, VectorOutRows *);

int diskAnnCreateIndex(sqlite3 *, const char *, const char *, const VectorIdxKey *, VectorIdxParams *);
int diskAnnClearIndex(sqlite3 *, const char *, const char *);
int diskAnnDropIndex(sqlite3 *, const char *, const char *);
int diskAnnOpenIndex(sqlite3 *, const char *, const char *, const VectorIdxParams *, DiskAnnIndex **);
void diskAnnCloseIndex(DiskAnnIndex *);
int diskAnnInsert(const DiskAnnIndex *, const VectorInRow *, char **);
int diskAnnDelete(const DiskAnnIndex *, const VectorInRow *, char **);
int diskAnnSearch(const DiskAnnIndex *, const Vector *, int, const VectorIdxKey *, VectorOutRows *, char **);

typedef struct VectorIdxCursor VectorIdxCursor;

#define VECTOR_INDEX_VTAB_NAME         "vector_top_k"
#define VECTOR_INDEX_GLOBAL_META_TABLE "libsql_vector_meta_shadow"
#define VECTOR_INDEX_MARKER_FUNCTION   "libsql_vector_idx"

int vectorIdxParseColumnType(const char *, int *, int *, const char **);

int vectorIndexCreate(Parse*, const Index*, const char *, const IdList*);
int vectorIndexClear(sqlite3 *, const char *, const char *);
int vectorIndexDrop(sqlite3 *, const char *, const char *);
int vectorIndexSearch(sqlite3 *, const char *, int, sqlite3_value **, VectorOutRows *, char **);
int vectorIndexCursorInit(sqlite3 *, const char *, const char *, VectorIdxCursor **);
void vectorIndexCursorClose(sqlite3 *, VectorIdxCursor *);
int vectorIndexInsert(VectorIdxCursor *, const UnpackedRecord *, char **);
int vectorIndexDelete(VectorIdxCursor *, const UnpackedRecord *, char **);

#ifdef __cplusplus
}  /* end of the 'extern "C"' block */
#endif

#endif /* _VECTOR_INDEX_H */