0
0
mirror of https://github.com/tursodatabase/libsql.git synced 2025-05-30 17:02:49 +00:00

add 1bit vector type

This commit is contained in:
Nikita Sivukhin
2024-08-05 18:14:58 +04:00
committed by Sivukhin Nikita
parent 83d029d7e9
commit 39e30ead5e
8 changed files with 190 additions and 42 deletions

@ -195,7 +195,7 @@ LIBOBJS0 = alter.lo analyze.lo attach.lo auth.lo \
sqlite3session.lo select.lo sqlite3rbu.lo status.lo stmt.lo \
table.lo threads.lo tokenize.lo treeview.lo trigger.lo \
update.lo userauth.lo upsert.lo util.lo vacuum.lo \
vector.lo vectorfloat32.lo vectorfloat64.lo \
vector.lo vectorfloat32.lo vectorfloat64.lo vector1bit.lo \
vectorIndex.lo vectordiskann.lo vectorvtab.lo \
vdbe.lo vdbeapi.lo vdbeaux.lo vdbeblob.lo vdbemem.lo vdbesort.lo \
vdbetrace.lo vdbevtab.lo \
@ -302,6 +302,7 @@ SRC = \
$(TOP)/src/util.c \
$(TOP)/src/vacuum.c \
$(TOP)/src/vector.c \
$(TOP)/src/vector1bit.c \
$(TOP)/src/vectorInt.h \
$(TOP)/src/vectorfloat32.c \
$(TOP)/src/vectorfloat64.c \
@ -1138,6 +1139,9 @@ vacuum.lo: $(TOP)/src/vacuum.c $(HDR)
vector.lo: $(TOP)/src/vector.c $(HDR)
$(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/vector.c
vector1bit.lo: $(TOP)/src/vector1bit.c $(HDR)
$(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/vector1bit.c
vectorfloat32.lo: $(TOP)/src/vectorfloat32.c $(HDR)
$(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/vectorfloat32.c

@ -41,6 +41,8 @@ size_t vectorDataSize(VectorType type, VectorDims dims){
return dims * sizeof(float);
case VECTOR_TYPE_FLOAT64:
return dims * sizeof(double);
case VECTOR_TYPE_1BIT:
return (dims + 7) / 8;
default:
assert(0);
}
@ -111,6 +113,8 @@ float vectorDistanceCos(const Vector *pVector1, const Vector *pVector2){
return vectorF32DistanceCos(pVector1, pVector2);
case VECTOR_TYPE_FLOAT64:
return vectorF64DistanceCos(pVector1, pVector2);
case VECTOR_TYPE_1BIT:
return vector1BitDistanceHamming(pVector1, pVector2);
default:
assert(0);
}
@ -381,6 +385,9 @@ void vectorDump(const Vector *pVector){
case VECTOR_TYPE_FLOAT64:
vectorF64Dump(pVector);
break;
case VECTOR_TYPE_1BIT:
vector1BitDump(pVector);
break;
default:
assert(0);
}
@ -451,6 +458,8 @@ size_t vectorSerializeToBlob(const Vector *pVector, unsigned char *pBlob, size_t
return vectorF32SerializeToBlob(pVector, pBlob, nBlobSize);
case VECTOR_TYPE_FLOAT64:
return vectorF64SerializeToBlob(pVector, pBlob, nBlobSize);
case VECTOR_TYPE_1BIT:
return vector1BitSerializeToBlob(pVector, pBlob, nBlobSize);
default:
assert(0);
}

@ -0,0 +1,110 @@
/*
** 2024-07-04
**
** Copyright 2024 the libSQL authors
**
** Permission is hereby granted, free of charge, to any person obtaining a copy of
** this software and associated documentation files (the "Software"), to deal in
** the Software without restriction, including without limitation the rights to
** use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
** the Software, and to permit persons to whom the Software is furnished to do so,
** subject to the following conditions:
**
** The above copyright notice and this permission notice shall be included in all
** copies or substantial portions of the Software.
**
** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
** FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
** COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
** IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**
******************************************************************************
**
** 1-bit vector format utilities.
*/
#ifndef SQLITE_OMIT_VECTOR
#include "sqliteInt.h"
#include "vectorInt.h"
#include <math.h>
/**************************************************************************
** Utility routines for debugging
**************************************************************************/
/*
** Debug helper: print every component of a 1-bit vector on one line of
** stdout, followed by a newline. A set bit is printed as 1 and a clear bit
** as -1. Component i is stored in byte i/8 at bit position i%8, least
** significant bit first.
*/
void vector1BitDump(const Vector *pVec){
  const u8 *aBytes = pVec->data;
  unsigned iBit = 0;

  assert( pVec->type == VECTOR_TYPE_1BIT );

  while( iBit < pVec->dims ){
    unsigned isSet = (aBytes[iBit >> 3] >> (iBit % 8)) & 1u;
    if( isSet ){
      printf("%d ", 1);
    }else{
      printf("%d ", -1);
    }
    iBit++;
  }
  printf("\n");
}
/**************************************************************************
** Utility routines for vector serialization and deserialization
**************************************************************************/
size_t vector1BitSerializeToBlob(
const Vector *pVector,
unsigned char *pBlob,
size_t nBlobSize
){
float *elems = pVector->data;
unsigned char *pPtr = pBlob;
size_t len = 0;
unsigned i;
assert( pVector->type == VECTOR_TYPE_1BIT );
assert( pVector->dims <= MAX_VECTOR_SZ );
assert( nBlobSize >= (pVector->dims + 7) / 8 );
for(i = 0; i < pVector->dims; i++){
elems[i] = pPtr[i];
}
return (pVector->dims + 7) / 8;
}
/*
** Population-count lookup table: BitsCount[b] is the number of set bits in
** the byte value b (0 <= b <= 255).
**
** Generated with the Python expression:
**   [sum(map(int, bin(i)[2:])) for i in range(256)]
**
** The table is read-only, so it is declared const.
*/
static const int BitsCount[256] = {
  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
/*
** Hamming distance between two 1-bit vectors of equal dimension: the
** number of component positions at which the two vectors differ.
**
** Components are packed 8 per byte, so exactly (dims + 7) / 8 bytes of each
** vector are examined. Differing bits are isolated with XOR and counted
** through the BitsCount lookup table.
**
** NOTE(review): if dims is not a multiple of 8, the unused high bits of the
** last byte take part in the XOR; this assumes they are equal (e.g. zero)
** in both vectors -- confirm against the code that builds 1-bit vectors.
*/
int vector1BitDistanceHamming(const Vector *v1, const Vector *v2){
  const u8 *e1 = v1->data;
  const u8 *e2 = v2->data;
  size_t nBytes;
  size_t i;
  int diff = 0;

  assert( v1->dims == v2->dims );
  assert( v1->type == VECTOR_TYPE_1BIT );
  assert( v2->type == VECTOR_TYPE_1BIT );

  /* iterate over packed bytes, not over individual components */
  nBytes = ((size_t)v1->dims + 7) / 8;
  for(i = 0; i < nBytes; i++){
    diff += BitsCount[e1[i] ^ e2[i]];
  }
  return diff;
}
#endif /* !defined(SQLITE_OMIT_VECTOR) */

@ -396,13 +396,14 @@ struct VectorParamName {
};
static struct VectorParamName VECTOR_PARAM_NAMES[] = {
{ "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN },
{ "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS },
{ "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 },
{ "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 },
{ "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 },
{ "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 },
{ "max_neighbors", VECTOR_MAX_NEIGHBORS_PARAM_ID, 1, 0, 0 },
{ "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN },
{ "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS },
{ "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 },
{ "compress_neighbors", VECTOR_METRIC_TYPE_PARAM_ID, 0, "1bit", VECTOR_TYPE_1BIT },
{ "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 },
{ "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 },
{ "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 },
{ "max_neighbors", VECTOR_MAX_NEIGHBORS_PARAM_ID, 1, 0, 0 },
};
static int parseVectorIdxParam(const char *zParam, VectorIdxParams *pParams, const char **pErrMsg) {
@ -802,7 +803,7 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co
int i, rc = SQLITE_OK;
int dims, type;
int hasLibsqlVectorIdxFn = 0, hasCollation = 0;
const char *pzErrMsg;
const char *pzErrMsg = NULL;
assert( zDbSName != NULL );
@ -914,9 +915,13 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co
sqlite3ErrorMsg(pParse, "vector index: unsupported for tables without ROWID and composite primary key");
return CREATE_FAIL;
}
rc = diskAnnCreateIndex(db, zDbSName, pIdx->zName, &idxKey, &idxParams);
rc = diskAnnCreateIndex(db, zDbSName, pIdx->zName, &idxKey, &idxParams, &pzErrMsg);
if( rc != SQLITE_OK ){
sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann");
if( pzErrMsg != NULL ){
sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann: %s", pzErrMsg);
}else{
sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann");
}
return CREATE_FAIL;
}
rc = insertIndexParameters(db, zDbSName, pIdx->zName, &idxParams);

@ -100,43 +100,45 @@ typedef u8 MetricType;
*/
/* format version which can help to upgrade vector on-disk format without breaking older version of the db */
#define VECTOR_FORMAT_PARAM_ID 1
#define VECTOR_FORMAT_PARAM_ID 1
/*
* 1 - initial version
*/
#define VECTOR_FORMAT_DEFAULT 1
#define VECTOR_FORMAT_DEFAULT 1
/* type of the vector index */
#define VECTOR_INDEX_TYPE_PARAM_ID 2
#define VECTOR_INDEX_TYPE_DISKANN 1
#define VECTOR_INDEX_TYPE_PARAM_ID 2
#define VECTOR_INDEX_TYPE_DISKANN 1
/* type of the underlying vector for the vector index */
#define VECTOR_TYPE_PARAM_ID 3
#define VECTOR_TYPE_PARAM_ID 3
/* dimension of the underlying vector for the vector index */
#define VECTOR_DIM_PARAM_ID 4
#define VECTOR_DIM_PARAM_ID 4
/* metric type used for comparing two vectors */
#define VECTOR_METRIC_TYPE_PARAM_ID 5
#define VECTOR_METRIC_TYPE_COS 1
#define VECTOR_METRIC_TYPE_L2 2
#define VECTOR_METRIC_TYPE_PARAM_ID 5
#define VECTOR_METRIC_TYPE_COS 1
#define VECTOR_METRIC_TYPE_L2 2
/* block size */
#define VECTOR_BLOCK_SIZE_PARAM_ID 6
#define VECTOR_BLOCK_SIZE_DEFAULT 128
#define VECTOR_BLOCK_SIZE_PARAM_ID 6
#define VECTOR_BLOCK_SIZE_DEFAULT 128
#define VECTOR_PRUNING_ALPHA_PARAM_ID 7
#define VECTOR_PRUNING_ALPHA_DEFAULT 1.2
#define VECTOR_PRUNING_ALPHA_PARAM_ID 7
#define VECTOR_PRUNING_ALPHA_DEFAULT 1.2
#define VECTOR_INSERT_L_PARAM_ID 8
#define VECTOR_INSERT_L_DEFAULT 70
#define VECTOR_INSERT_L_PARAM_ID 8
#define VECTOR_INSERT_L_DEFAULT 70
#define VECTOR_SEARCH_L_PARAM_ID 9
#define VECTOR_SEARCH_L_DEFAULT 200
#define VECTOR_SEARCH_L_PARAM_ID 9
#define VECTOR_SEARCH_L_DEFAULT 200
#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10
#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10
#define VECTOR_COMPRESS_NEIGHBORS_PARAM_ID 11
/* total amount of vector index parameters */
#define VECTOR_PARAM_IDS_COUNT 9
#define VECTOR_PARAM_IDS_COUNT 11
/*
* Vector index parameters are stored in simple binary format (1 byte tag + 8 byte u64 integer / f64 float)
@ -218,7 +220,7 @@ int vectorOutRowsPut(VectorOutRows *, int, int, const u64 *, sqlite3_value *);
void vectorOutRowsGet(sqlite3_context *, const VectorOutRows *, int, int);
void vectorOutRowsFree(sqlite3 *, VectorOutRows *);
int diskAnnCreateIndex(sqlite3 *, const char *, const char *, const VectorIdxKey *, VectorIdxParams *);
int diskAnnCreateIndex(sqlite3 *, const char *, const char *, const VectorIdxKey *, VectorIdxParams *, const char **);
int diskAnnClearIndex(sqlite3 *, const char *, const char *);
int diskAnnDropIndex(sqlite3 *, const char *, const char *);
int diskAnnOpenIndex(sqlite3 *, const char *, const char *, const VectorIdxParams *, DiskAnnIndex **);

@ -24,6 +24,7 @@ typedef u32 VectorDims;
*/
#define VECTOR_TYPE_FLOAT32 1
#define VECTOR_TYPE_FLOAT64 2
#define VECTOR_TYPE_1BIT 3
#define VECTOR_FLAGS_STATIC 1
@ -48,8 +49,9 @@ void vectorInit(Vector *, VectorType, VectorDims, void *);
* Dumps vector on the console (used only for debugging)
*/
void vectorDump (const Vector *v);
void vectorF32Dump(const Vector *v);
void vectorF64Dump(const Vector *v);
void vectorF32Dump (const Vector *v);
void vectorF64Dump (const Vector *v);
void vector1BitDump(const Vector *v);
/*
* Converts vector to the text representation and write the result to the sqlite3_context
@ -61,9 +63,10 @@ void vectorF64MarshalToText(sqlite3_context *, const Vector *);
/*
* Serializes vector to the blob in little-endian format according to the IEEE-754 standard
*/
size_t vectorSerializeToBlob (const Vector *, unsigned char *, size_t);
size_t vectorF32SerializeToBlob(const Vector *, unsigned char *, size_t);
size_t vectorF64SerializeToBlob(const Vector *, unsigned char *, size_t);
size_t vectorSerializeToBlob (const Vector *, unsigned char *, size_t);
size_t vectorF32SerializeToBlob (const Vector *, unsigned char *, size_t);
size_t vectorF64SerializeToBlob (const Vector *, unsigned char *, size_t);
size_t vector1BitSerializeToBlob(const Vector *, unsigned char *, size_t);
/*
* Calculates cosine distance between two vectors (vector must have same type and same dimensions)
@ -72,6 +75,11 @@ float vectorDistanceCos (const Vector *, const Vector *);
float vectorF32DistanceCos (const Vector *, const Vector *);
double vectorF64DistanceCos(const Vector *, const Vector *);
/*
* Calculates hamming distance between two 1-bit vectors (vector must have same dimensions)
*/
int vector1BitDistanceHamming(const Vector *, const Vector *);
/*
* Calculates L2 distance between two vectors (vector must have same type and same dimensions)
*/

@ -437,10 +437,11 @@ int diskAnnCreateIndex(
const char *zDbSName,
const char *zIdxName,
const VectorIdxKey *pKey,
VectorIdxParams *pParams
VectorIdxParams *pParams,
const char **pzErrMsg
){
int rc;
int type, dims;
int type, dims, metric, neighbours;
u64 maxNeighborsParam, blockSizeBytes;
char *zSql;
char columnSqlDefs[VECTOR_INDEX_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...)
@ -477,11 +478,19 @@ int diskAnnCreateIndex(
if( vectorIdxParamsPutU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID, MAX(256, blockSizeBytes)) != 0 ){
return SQLITE_ERROR;
}
if( vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID) == 0 ){
if( vectorIdxParamsPutU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID, VECTOR_METRIC_TYPE_COS) != 0 ){
metric = vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID);
if( metric == 0 ){
metric = VECTOR_METRIC_TYPE_COS;
if( vectorIdxParamsPutU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID, metric) != 0 ){
return SQLITE_ERROR;
}
}
neighbours = vectorIdxParamsGetU64(pParams, VECTOR_COMPRESS_NEIGHBORS_PARAM_ID);
if( neighbours == VECTOR_TYPE_1BIT && metric != VECTOR_METRIC_TYPE_COS ){
*pzErrMsg = "1-bit compression available only for cosine metric";
return SQLITE_ERROR;
}
if( vectorIdxParamsGetF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID) == 0 ){
if( vectorIdxParamsPutF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID, VECTOR_PRUNING_ALPHA_DEFAULT) != 0 ){
return SQLITE_ERROR;
@ -1544,8 +1553,8 @@ int diskAnnOpenIndex(
pIndex->nEdgeVectorType = pIndex->nNodeVectorType;
pIndex->nEdgeVectorSize = pIndex->nNodeVectorSize;
}else if( compressNeighbours == VECTOR_TYPE_1BIT ){
pIndex->nEdgeVectorType = VECTOR_TYPE_1BIT;
pIndex->nEdgeVectorSize = vectorDataSize(VECTOR_TYPE_1BIT, pIndex->nVectorDims);
pIndex->nEdgeVectorType = compressNeighbours;
pIndex->nEdgeVectorSize = vectorDataSize(compressNeighbours, pIndex->nVectorDims);
}else{
return SQLITE_ERROR;
}

@ -468,6 +468,7 @@ set flist {
json.c
vector.c
vector1bit.c
vectordiskann.c
vectorfloat32.c
vectorfloat64.c